Keskustelu:Valtion konesalistrategia 2014
Kohteesta Opasnet Suomi
Obs | Versio | Sivu | Ident | Result |
---|---|---|---|---|
1 | 1 | Valtion konesalistrategia 2014 | Op_fi4724 |
Tietokannan alustus
library(OpasnetUtils)

# Database initialisation: every subset used by this page receives a single
# placeholder row (Versio = 0, marked "Alustus") so that the subset exists
# before any real rows are written to it.

# Upload one placeholder row to the given subset of this page's data.
# `pagename` and `wiki_username` are not defined here; they are expected to
# be present in the calling environment.
upload_seed <- function(seed, subset_name) {
  opbase.upload(
    seed,
    name = pagename,
    who = wiki_username,
    subset = subset_name
  )
}

# Removed fragments.
upload_seed(
  data.frame(
    Versio = 0,
    Sivu = NA,
    Aika = date(),
    Käyttäjä = wiki_username,
    Syy = "Alustus",
    Result = NA
  ),
  "Jakeenpoistot"
)

# Added fragments.
upload_seed(
  data.frame(
    Versio = 0,
    Sivu = NA,
    Aika = date(),
    Käyttäjä = wiki_username,
    JaeID = NA,
    Result = "Alustus"
  ),
  "Jakeenlisaykset"
)

# Keywords attached to fragments.
upload_seed(
  data.frame(
    AvainsanaID = NA,
    Versio = 0,
    Aika = date(),
    Käyttäjä = wiki_username,
    Sivu = NA,
    JaeID = NA,
    Result = "Alustus"
  ),
  "Avainsanat"
)

# Relations between two fragments (page 1 / fragment 1, page 2 / fragment 2).
upload_seed(
  data.frame(
    RelaatioID = NA,
    Versio = 0,
    Aika = date(),
    Käyttäjä = wiki_username,
    Sivu1 = NA,
    JaeID1 = NA,
    Sivu2 = NA,
    JaeID2 = NA,
    Result = "Alustus"
  ),
  "Relaatiot"
)

# Removed relations.
upload_seed(
  data.frame(
    Versio = 0,
    Aika = date(),
    Käyttäjä = wiki_username,
    Syy = "Alustus",
    Result = NA
  ),
  "Relaationpoistot"
)

# Removed keywords.
upload_seed(
  data.frame(
    Versio = 0,
    Aika = date(),
    Käyttäjä = wiki_username,
    Syy = "Alustus",
    Result = NA
  ),
  "Avainsananpoistot"
)
Automaattitägitys
library(OpasnetUtils)

# Load shared helper objects stored on page Op_fi4325 (get_meta_data(), used
# below, is presumably one of them -- TODO confirm).
objects.latest("Op_fi4325", "apufunktiot")

#' Split a rendered wiki page into text fragments
#'
#' Fetches the rendered HTML of a page and cuts it into fragments: one per
#' `<p>`, `<td>`, `<li>` and `<dd>` element. Table cells that themselves
#' contain paragraphs are broken up at those paragraphs so that the paragraph
#' text is not reported twice. Fragments are returned in page order and each
#' is labelled with the level 2 heading it appears under.
#'
#' @param url Page name, with spaces already replaced by underscores.
#' @param url_args Extra query string appended to `url` (for example
#'   `"&oldid=1234"`).
#' @param nchar_threshold Fragments with at most this many characters are
#'   dropped.
#' @return A data frame with columns `Aihe` (text of the preceding `<h2>`
#'   heading, `NA` for fragments before the first heading), `JaeID` (running
#'   fragment number as character) and `Result` (fragment text).
parse_page <- function(url, url_args = "", nchar_threshold = 11) {
  # Cut every regex hit out of `text`.
  # NOTE(review): the stop index is start + match.length, i.e. one character
  # past the end of the match. This is kept on purpose: the piece/position
  # bookkeeping for table cells below appears to rely on that trailing
  # character -- confirm before "fixing" it.
  cut_hits <- function(text, hits) {
    substr(rep(text, length(hits)), hits, hits + attr(hits, "match.length"))
  }

  turl <- paste0(url, url_args, "&action=render")
  a <- opasnet.page(turl, wiki = "opasnet_fi")

  # Paragraphs
  par <- gregexpr("<p>(.*?)</p>", a)[[1]]
  par_out <- cut_hits(a, par)
  par_out <- gsub("<p>", "", par_out)
  par_out <- gsub("</p>", "", par_out)

  # Table cells (header cells, <th>, are not parsed)
  td <- gregexpr("<td[^>]*>(.*?)</td>", a)[[1]]
  table_out <- cut_hits(a, td)
  table_out <- gsub("<td[^>]*>", "", table_out)
  table_out <- gsub("</td>", "", table_out)

  # Use paragraphs in table cells to break content (glitchy?)
  # At least it removes duplicates of <p> (due to matching <p> search as well
  # as <td>)
  tdppos <- gregexpr("<p>(.*?)</p>", table_out)
  table_out <- gsub("<p>(.*?)</p>", "<super_separator>", table_out)
  tdp <- strsplit(table_out, "<super_separator>")
  tdpl <- vapply(tdp, length, integer(1))
  has_split <- tdpl > 1

  # Begin positions of the <p>-separated bits inside each split <td>
  temp <- lapply(which(has_split), function(i) {
    c(0, tdppos[[i]] + attr(tdppos[[i]], "match.length")) + td[i]
  })

  # Keep only the bits that come from cells that were actually split
  tdp <- unlist(tdp)[rep(has_split, times = tdpl)]

  # Drop the split cells from the originals. A logical mask is used because
  # a negative index built from an empty set (`-integer(0)`) selects nothing,
  # which would discard every table cell on pages where no cell was split.
  table_out <- table_out[!has_split]
  td <- td[!has_split]

  # Add new bits and positions to end of filtered original
  table_out <- c(table_out, tdp)
  td <- c(td, unlist(temp))

  # List items
  li <- gregexpr("<li>(.*?)</li>", a)[[1]]
  li_out <- cut_hits(a, li)
  li_out <- gsub("<li>", "", li_out)
  li_out <- gsub("</li>", "", li_out)

  # Definition list descriptions
  dd <- gregexpr("<dd>(.*?)</dd>", a)[[1]]
  dd_out <- cut_hits(a, dd)
  dd_out <- gsub("<dd>", "", dd_out)
  dd_out <- gsub("</dd>", "", dd_out)

  # Put all fragments into page order and drop the short ones
  out <- c(par_out, table_out, li_out, dd_out)
  pos <- c(par, td, li, dd)
  page_order <- order(pos)
  out <- out[page_order]
  pos <- pos[page_order]
  long_enough <- nchar(out) > nchar_threshold
  out <- out[long_enough]
  pos <- pos[long_enough]

  # Level 2 header = Topic
  h2 <- gregexpr("<h2>(.*?)</h2>", a)[[1]]
  # One heading text per <h2> hit, so that h2_out[i] exists for every i used
  # in the loop below.
  h2_out <- cut_hits(a, h2)
  h2_out <- gsub("<[^>]*>", "", h2_out)

  # seq_along() and an explicit-length Aihe keep this valid when no fragment
  # survived the length filter (zero-row result instead of an error).
  out <- data.frame(
    Aihe = rep(NA, length(out)),
    JaeID = as.character(seq_along(out)),
    Result = out
  )

  # A fragment belongs to heading i when it lies after that heading and
  # before the next one (or the end of the page for the last heading).
  h2_end <- c(h2, Inf)
  for (i in seq_along(h2)) {
    under_heading <- pos < h2_end[i + 1] & pos > h2[i]
    out$Aihe[under_heading] <- h2_out[i]
  }

  out
}

# ---- Parse every page of the latest version and upload the fragments ----

versiot <- get_meta_data(ident)
version <- max(as.numeric(as.character(versiot$Versio)))

# Apply the same latest-version mask to page names, revision ids and version
# numbers so that element i of each vector describes the same metadata row.
is_latest <- versiot$Versio == version
names <- as.character(versiot$Sivu[is_latest])
page_versions <- versiot$Versio[is_latest]
url <- gsub(" ", "_", names)
old_id <- paste0("&oldid=", versiot$Result[is_latest])

if (length(url) == 0) {
  stop("No pages found for version ", version, call. = FALSE)
}

pages <- lapply(seq_along(url), function(i) {
  temp <- parse_page(url[i], old_id[i])
  # rep() keeps these assignments valid for a page with zero fragments
  temp$Sivu <- rep(names[i], nrow(temp))
  temp$Versio <- rep(page_versions[i], nrow(temp))
  temp
})
out <- do.call(rbind, pages)

out <- out[c("Versio", "Sivu", "Aihe", "JaeID", "Result")]

opbase.upload(
  out,
  name = pagename,
  who = wiki_username,
  subset = "Jakeet"
)