Ero sivun ”Tägikone” versioiden välillä
Siirry navigaatioon
Siirry hakuun
(uusi versio) |
(→Apufunktiot: jakeistajan resoluutio lausetasolle uudella parametrilla) |
||
| (15 välissä olevaa versiota samalta käyttäjältä ei näytetä) | |||
| Rivi 2: | Rivi 2: | ||
== Apufunktiot == | == Apufunktiot == | ||
<rcode label='Alusta funktiot' name='apufunktiot' embed=1> | '''Vaati plyr kirjaston''' | ||
<rcode label='Alusta funktiot' name='apufunktiot' embed=1 showcode=1> | |||
library(OpasnetUtils) | library(OpasnetUtils) | ||
# Requires plyr | |||
get_page_ident <- function(main_ident, page_name) { | get_page_ident <- function(main_ident, page_name) { | ||
| Rivi 17: | Rivi 20: | ||
out <- max(as.numeric(as.character(ver$Versio))) | out <- max(as.numeric(as.character(ver$Versio))) | ||
return(out) | return(out) | ||
} | |||
get_meta_data <- function(main_ident, page_name = NULL) { | |||
filter <- list() | |||
filter$Sivu <- page_name | |||
ver <- opbase.data(main_ident, subset = "Versiot", include = filter) | |||
return(ver) | |||
} | } | ||
| Rivi 26: | Rivi 36: | ||
opbase.data( | opbase.data( | ||
i, | i, | ||
include = filter | include = filter[names(filter) %in% opbase.indices(i)] | ||
), | ), | ||
error = function(...) return(data.frame()) | error = function(...) return(data.frame()) | ||
| Rivi 36: | Rivi 46: | ||
first <- FALSE | first <- FALSE | ||
} else { | } else { | ||
out <- rbind(out, temp[colnames(out)]) | out <- rbind.fill(out, temp[colnames(temp) %in% colnames(out)]) | ||
} | } | ||
} | } | ||
} | } | ||
if (!is.null(rm_id)) { | if (!is.null(rm_id)) { | ||
#filter[[rm_ind]] <- NULL | |||
#if (length(filter) == 0) filter <- NULL | |||
temp <- tryCatch( | temp <- tryCatch( | ||
opbase.data( | opbase.data( | ||
rm_id, | rm_id, | ||
include = filter | include = filter[names(filter) %in% opbase.indices(rm_id)] | ||
), | ), | ||
error = function(...) return(NULL) | error = function(...) return(NULL) | ||
| Rivi 55: | Rivi 67: | ||
temp <- temp[colnames(temp)[colnames(temp) %in% c(rm_ind, "Sivu", "Versio")]] | temp <- temp[colnames(temp)[colnames(temp) %in% c(rm_ind, "Sivu", "Versio")]] | ||
temp$Remove_tagged <- 1 | temp$Remove_tagged <- 1 | ||
out <- | out <- join(out, temp, match = "first") # left join from plyr, preserves original order, first match is faster | ||
out <- out[is.na(out$Remove_tagged),] | out <- out[is.na(out$Remove_tagged),] | ||
out[["Remove_tagged"]] <- NULL | out[["Remove_tagged"]] <- NULL | ||
| Rivi 63: | Rivi 75: | ||
} | } | ||
upload_with_autoid <- function(data, ident.subset, id_name, pagename, prefix = character(), filter = NULL) { | |||
upload_with_autoid <- function(data, ident.subset, id_name, prefix = character(), filter = NULL) { | |||
# Bad implementation that downloads whole data just to find next id | # Bad implementation that downloads whole data just to find next id | ||
temp <- tryCatch( | temp <- tryCatch( | ||
opbase.data(ident.subset, include = filter), | |||
error = function(...) return(NULL) | |||
) | ) | ||
if (!is.null(temp)) { | if (!is.null(temp)) { | ||
| Rivi 77: | Rivi 88: | ||
} | } | ||
if (nrow(data) > 1) obs <- obs:(obs + nrow(data) - 1) | if (nrow(data) > 1) obs <- obs:(obs + nrow(data) - 1) | ||
data[[id_name]] <- obs | data[[id_name]] <- paste(prefix, obs, sep = "") | ||
ident.subset <- strsplit(ident.subset, ".", fixed = TRUE)[[1]] | ident.subset <- strsplit(ident.subset, ".", fixed = TRUE)[[1]] | ||
opbase.upload( | opbase.upload( | ||
data, | |||
ident = ident.subset[1], | |||
name = pagename, # needs rcode tag variables | |||
subset = ident.subset[2], | |||
act_type = "append", | |||
#language = "fin", | |||
who = wiki_username | |||
) | ) | ||
#return(data) | |||
} | } | ||
| Rivi 99: | Rivi 111: | ||
out <- try_dl_rm( | out <- try_dl_rm( | ||
paste(main_ident, "Jakeet", sep = "."), | paste(main_ident, "Jakeet", sep = "."), | ||
paste(main_ident, " | paste(main_ident, "Jakeenlisaykset", sep = "."), | ||
filter = filter, | filter = filter, | ||
rm_id = paste(main_ident, "Jakeenpoistot", sep = "."), | rm_id = paste(main_ident, "Jakeenpoistot", sep = "."), | ||
| Rivi 105: | Rivi 117: | ||
) | ) | ||
meta_data <- get_meta_data(main_ident, sivu) | |||
ids <- as.character(unique(meta_data$Ident)) | |||
names <- as.character(unique(meta_data$Sivu)) | |||
# If NULL matches all pages listed in Versiot | # If NULL matches all pages listed in Versiot | ||
for (i in ids) { | if (length(ids) > 0) { | ||
for (i in 1:length(ids)) { | |||
temp <- try_dl_rm( | |||
paste(ids[i], "Kommentit", sep = "."), | |||
filter = filter, | |||
rm_id = paste(ids[i], "Kommentinpoistot", sep = "."), | |||
rm_ind = "JaeID" | |||
) | |||
if (nrow(temp) > 0) { | |||
temp$Sivu <- names[i] | |||
if(nrow(out) > 0) { | |||
for (i in colnames(out)[!colnames(out) %in% colnames(temp)]) { | |||
temp[[i]] <- NA | |||
} | |||
out <- rbind(out, temp[colnames(out)]) | |||
} else { | |||
out <- temp | |||
} | |||
} | |||
} | |||
} | |||
return(out) | |||
} | |||
parse_page <- function(url, url_args = "", nchar_threshold = 11, res = "par") { | |||
turl <- paste(url, url_args, "&action=render", sep = "") | |||
a <- opasnet.page(turl, wiki = "opasnet_fi") | |||
par <- gregexpr("<p>(.*?)</p>", a)[[1]] | |||
par_out <- substr(rep(a, length(par)), par, par + attributes(par)$match.length) | |||
par_out <- gsub("<p>", "", par_out) | |||
par_out <- gsub("</p>", "", par_out) | |||
table <- gregexpr("<table[^>]*>(.*?)</table>", a)[[1]] | |||
td <- gregexpr("<td[^>]*>(.*?)</td>", a)[[1]] | |||
#th <- gregexpr("<th[^>]*>(.*?)</th>", a)[[1]] | |||
table_out <- substr(rep(a, length(td)), td, td + attributes(td)$match.length) | |||
table_out <- gsub("<td[^>]*>", "", table_out) | |||
table_out <- gsub("</td>", "", table_out) | |||
# Use paragraphs in table cells to break content (glitchy?) | |||
# At least it removes duplicates of <p> (due to matching <p> search as well as <td>) | |||
tdppos <- gregexpr("<p>(.*?)</p>", table_out) | |||
table_out <- gsub("<p>(.*?)</p>", "<super_separator>", table_out) | |||
tdp <- strsplit(table_out, "<super_separator>") | |||
tdpl <- sapply(tdp, length) | |||
temp <- list() | |||
# Find begin positions of <p> separated bits in <td> | |||
for (i in (1:length(tdp))[tdpl > 1]) { | |||
temp[[i]] <- c(0, tdppos[[i]] + attributes(tdppos[[i]])$match.length) + td[i] | |||
} | |||
# Filter <td> with <p> | |||
filter <- logical() | |||
for (i in 1:length(tdp)) { | |||
if (tdpl[i] == 1) val <- FALSE else val <- TRUE | |||
filter <- c(filter, rep(val, tdpl[i])) | |||
} | |||
tdp <- unlist(tdp)[filter] | |||
filter <- -(1:length(tdpl))[tdpl > 1] | |||
# Filter out | |||
table_out <- table_out[filter] | |||
td <- td[filter] | |||
# Add new bits and positions to end of filtered original | |||
table_out <- c(table_out, tdp) | |||
td <- c(td, unlist(temp)) | |||
li <- gregexpr("<li>(.*?)</li>", a)[[1]] | |||
li_out <- substr(rep(a, length(li)), li, li + attributes(li)$match.length) | |||
li_out <- gsub("<li>", "", li_out) | |||
li_out <- gsub("</li>", "", li_out) | |||
dd <- gregexpr("<dd>(.*?)</dd>", a)[[1]] | |||
dd_out <- substr(rep(a, length(dd)), dd, dd + attributes(dd)$match.length) | |||
dd_out <- gsub("<dd>", "", dd_out) | |||
dd_out <- gsub("</dd>", "", dd_out) | |||
out <- c(par_out, table_out, li_out, dd_out) | |||
pos <- order(c(par, td, li, dd)) | |||
out <- out[pos] | |||
pos <- c(par, td, li, dd)[pos] | |||
filter <- nchar(out) > nchar_threshold | |||
out <- out[filter] | |||
pos <- pos[filter] | |||
# Resolution beyond paragraph | |||
if (res == "sentence") { | |||
per <- gregexpr("\\.|\\?|!", out) | |||
per_out <- list() | |||
for (i in 1:length(out)) { | |||
per_out[[i]] <- substr( | |||
rep(out[i], length(per[[i]]) + 1), | |||
c(0, per[[i]]) + 1, | |||
c(per[[i]], nchar(out[i])) | |||
) | |||
j <- 2 | |||
while (j < length(per_out[[i]]) + 1) { | |||
if (nchar(gsub("\\s", "", per_out[[i]][j])) < 4) { | |||
per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "") | |||
per_out[[i]] <- per_out[[i]][-j] | |||
if (j - 1 < length(per_out[[i]])) { | |||
per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "") | |||
per_out[[i]] <- per_out[[i]][-j] | |||
} | |||
} else { | |||
j <- j + 1 | |||
} | |||
} | |||
#j <- length(per_out[[i]]) | |||
#while (j > 0) { | |||
# if (nchar(gsub("\\s", "", per_out[[i]][j])) < nchar_threshold) { | |||
# per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "") | |||
# per_out[[i]] <- per_out[[i]][-j] | |||
# } else { | |||
# j <- j - 1 | |||
# } | |||
#} | |||
} | |||
out <- unlist(per_out) | |||
pos <- rep(pos, sapply(per_out, length)) | |||
filter <- nchar(out) > nchar_threshold | |||
out <- out[filter] | |||
pos <- pos[filter] | |||
out <- gsub("^\\s*|\\s*$", "", out) | |||
out <- gsub("\\n", " ", out) | |||
# Advanced natural language processing not available for finnish language | |||
#require(NLP) | |||
#require(openNLP) | |||
#s <- as.String(s) | |||
#sent_token_annotator <- Maxent_Sent_Token_Annotator("fi") | |||
#sent_token_annotator | |||
#a1 <- annotate(s, sent_token_annotator) | |||
#a1 | |||
## Extract sentences. | |||
#s[a1] | |||
} | |||
# Level 2 header = Topic | |||
h2 <- gregexpr("<h2>(.*?)</h2>", a)[[1]] | |||
h2_out <- substr(rep(a, length(par)), h2, h2 + attributes(h2)$match.length) | |||
h2_out <- gsub("<[^>]*>", "", h2_out) | |||
out <- data.frame( | |||
Aihe = NA, | |||
JaeID = as.character(1:length(out)), | |||
Result = out | |||
) | |||
for (i in 1:length(h2)) { | |||
filter2 <- pos < c(h2, Inf)[i + 1] & pos > h2[i] | |||
out$Aihe[filter2] <- h2_out[i] | |||
} | } | ||
return(out) | return(out) | ||
} | } | ||
objects.store(get_page_ident, get_current_version, try_dl_rm, upload_with_autoid, jaehakukone) | |||
objects.store(get_page_ident, get_meta_data, get_current_version, try_dl_rm, upload_with_autoid, jaehakukone, parse_page) | |||
</rcode> | </rcode> | ||
== Kooditemplaatteja == | == Kooditemplaatteja == | ||
<pre> | <pre> | ||
Nykyinen versio 9. joulukuuta 2014 kello 14.58
| Moderaattori:Jouni (katso kaikki)
Sivun edistymistä ei ole arvioitu. Arvostuksen määrää ei ole arvioitu (ks. peer review). |
| Lisää dataa
|
Apufunktiot
Vaati plyr kirjaston
library(OpasnetUtils)
# Requires plyr
get_page_ident <- function(main_ident, page_name) {
filter <- list()
filter$Sivu <- page_name
ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
out <- as.character(unique(ver$Ident))
return(out)
}
get_current_version <- function(main_ident) {
ver <- opbase.data(main_ident, subset = "Versiot")
out <- max(as.numeric(as.character(ver$Versio)))
return(out)
}
get_meta_data <- function(main_ident, page_name = NULL) {
filter <- list()
filter$Sivu <- page_name
ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
return(ver)
}
try_dl_rm <- function(..., filter = NULL, rm_id = NULL, rm_ind = "JaeID") {
out <- data.frame()
first <- TRUE
for (i in list(...)) {
temp <- tryCatch(
opbase.data(
i,
include = filter[names(filter) %in% opbase.indices(i)]
),
error = function(...) return(data.frame())
)
#if (nrow(temp) == 0) print(paste("Failed to download ", i, " ", geterrmessage(), sep = ""))
if (nrow(temp) > 0) {
if (first) {
out <- rbind(out, temp)
first <- FALSE
} else {
out <- rbind.fill(out, temp[colnames(temp) %in% colnames(out)])
}
}
}
if (!is.null(rm_id)) {
#filter[[rm_ind]] <- NULL
#if (length(filter) == 0) filter <- NULL
temp <- tryCatch(
opbase.data(
rm_id,
include = filter[names(filter) %in% opbase.indices(rm_id)]
),
error = function(...) return(NULL)
)
#if (is.null(temp)) print(paste("Failed to download ", rm_id, " ", geterrmessage(), sep = ""))
if (!is.null(temp)) {
colnames(temp)[colnames(temp) == "Result"] <- rm_ind
#temp <- temp[colnames(temp)[colnames(temp) %in% colnames(out)]]
# On second thought we should only get marginals into the merge...
temp <- temp[colnames(temp)[colnames(temp) %in% c(rm_ind, "Sivu", "Versio")]]
temp$Remove_tagged <- 1
out <- join(out, temp, match = "first") # left join from plyr, preserves original order, first match is faster
out <- out[is.na(out$Remove_tagged),]
out[["Remove_tagged"]] <- NULL
}
}
return(out)
}
upload_with_autoid <- function(data, ident.subset, id_name, pagename, prefix = character(), filter = NULL) {
# Bad implementation that downloads whole data just to find next id
temp <- tryCatch(
opbase.data(ident.subset, include = filter),
error = function(...) return(NULL)
)
if (!is.null(temp)) {
obs <- gsub("[A-Za-z]", "", as.character(temp[[id_name]]))
obs <- max(as.numeric(obs)) + 1
} else {
obs <- 1
}
if (nrow(data) > 1) obs <- obs:(obs + nrow(data) - 1)
data[[id_name]] <- paste(prefix, obs, sep = "")
ident.subset <- strsplit(ident.subset, ".", fixed = TRUE)[[1]]
opbase.upload(
data,
ident = ident.subset[1],
name = pagename, # needs rcode tag variables
subset = ident.subset[2],
act_type = "append",
#language = "fin",
who = wiki_username
)
#return(data)
}
jaehakukone <- function(main_ident, sivu = NULL, versio = NULL, jae = NULL) {
filter <- list()
if (is.null(versio)) versio <- get_current_version(main_ident)
filter$Versio <- versio
filter$Sivu <- sivu
filter$JaeID <- jae
out <- try_dl_rm(
paste(main_ident, "Jakeet", sep = "."),
paste(main_ident, "Jakeenlisaykset", sep = "."),
filter = filter,
rm_id = paste(main_ident, "Jakeenpoistot", sep = "."),
rm_ind = "JaeID"
)
meta_data <- get_meta_data(main_ident, sivu)
ids <- as.character(unique(meta_data$Ident))
names <- as.character(unique(meta_data$Sivu))
# If NULL matches all pages listed in Versiot
if (length(ids) > 0) {
for (i in 1:length(ids)) {
temp <- try_dl_rm(
paste(ids[i], "Kommentit", sep = "."),
filter = filter,
rm_id = paste(ids[i], "Kommentinpoistot", sep = "."),
rm_ind = "JaeID"
)
if (nrow(temp) > 0) {
temp$Sivu <- names[i]
if(nrow(out) > 0) {
for (i in colnames(out)[!colnames(out) %in% colnames(temp)]) {
temp[[i]] <- NA
}
out <- rbind(out, temp[colnames(out)])
} else {
out <- temp
}
}
}
}
return(out)
}
parse_page <- function(url, url_args = "", nchar_threshold = 11, res = "par") {
turl <- paste(url, url_args, "&action=render", sep = "")
a <- opasnet.page(turl, wiki = "opasnet_fi")
par <- gregexpr("<p>(.*?)</p>", a)[[1]]
par_out <- substr(rep(a, length(par)), par, par + attributes(par)$match.length)
par_out <- gsub("<p>", "", par_out)
par_out <- gsub("</p>", "", par_out)
table <- gregexpr("<table[^>]*>(.*?)</table>", a)[[1]]
td <- gregexpr("<td[^>]*>(.*?)</td>", a)[[1]]
#th <- gregexpr("<th[^>]*>(.*?)</th>", a)[[1]]
table_out <- substr(rep(a, length(td)), td, td + attributes(td)$match.length)
table_out <- gsub("<td[^>]*>", "", table_out)
table_out <- gsub("</td>", "", table_out)
# Use paragraphs in table cells to break content (glitchy?)
# At least it removes duplicates of <p> (due to matching <p> search as well as <td>)
tdppos <- gregexpr("<p>(.*?)</p>", table_out)
table_out <- gsub("<p>(.*?)</p>", "<super_separator>", table_out)
tdp <- strsplit(table_out, "<super_separator>")
tdpl <- sapply(tdp, length)
temp <- list()
# Find begin positions of <p> separated bits in <td>
for (i in (1:length(tdp))[tdpl > 1]) {
temp[[i]] <- c(0, tdppos[[i]] + attributes(tdppos[[i]])$match.length) + td[i]
}
# Filter <td> with <p>
filter <- logical()
for (i in 1:length(tdp)) {
if (tdpl[i] == 1) val <- FALSE else val <- TRUE
filter <- c(filter, rep(val, tdpl[i]))
}
tdp <- unlist(tdp)[filter]
filter <- -(1:length(tdpl))[tdpl > 1]
# Filter out
table_out <- table_out[filter]
td <- td[filter]
# Add new bits and positions to end of filtered original
table_out <- c(table_out, tdp)
td <- c(td, unlist(temp))
li <- gregexpr("<li>(.*?)</li>", a)[[1]]
li_out <- substr(rep(a, length(li)), li, li + attributes(li)$match.length)
li_out <- gsub("<li>", "", li_out)
li_out <- gsub("</li>", "", li_out)
dd <- gregexpr("<dd>(.*?)</dd>", a)[[1]]
dd_out <- substr(rep(a, length(dd)), dd, dd + attributes(dd)$match.length)
dd_out <- gsub("<dd>", "", dd_out)
dd_out <- gsub("</dd>", "", dd_out)
out <- c(par_out, table_out, li_out, dd_out)
pos <- order(c(par, td, li, dd))
out <- out[pos]
pos <- c(par, td, li, dd)[pos]
filter <- nchar(out) > nchar_threshold
out <- out[filter]
pos <- pos[filter]
# Resolution beyond paragraph
if (res == "sentence") {
per <- gregexpr("\\.|\\?|!", out)
per_out <- list()
for (i in 1:length(out)) {
per_out[[i]] <- substr(
rep(out[i], length(per[[i]]) + 1),
c(0, per[[i]]) + 1,
c(per[[i]], nchar(out[i]))
)
j <- 2
while (j < length(per_out[[i]]) + 1) {
if (nchar(gsub("\\s", "", per_out[[i]][j])) < 4) {
per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
per_out[[i]] <- per_out[[i]][-j]
if (j - 1 < length(per_out[[i]])) {
per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
per_out[[i]] <- per_out[[i]][-j]
}
} else {
j <- j + 1
}
}
#j <- length(per_out[[i]])
#while (j > 0) {
# if (nchar(gsub("\\s", "", per_out[[i]][j])) < nchar_threshold) {
# per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
# per_out[[i]] <- per_out[[i]][-j]
# } else {
# j <- j - 1
# }
#}
}
out <- unlist(per_out)
pos <- rep(pos, sapply(per_out, length))
filter <- nchar(out) > nchar_threshold
out <- out[filter]
pos <- pos[filter]
out <- gsub("^\\s*|\\s*$", "", out)
out <- gsub("\\n", " ", out)
# Advanced natural language processing not available for finnish language
#require(NLP)
#require(openNLP)
#s <- as.String(s)
#sent_token_annotator <- Maxent_Sent_Token_Annotator("fi")
#sent_token_annotator
#a1 <- annotate(s, sent_token_annotator)
#a1
## Extract sentences.
#s[a1]
}
# Level 2 header = Topic
h2 <- gregexpr("<h2>(.*?)</h2>", a)[[1]]
h2_out <- substr(rep(a, length(par)), h2, h2 + attributes(h2)$match.length)
h2_out <- gsub("<[^>]*>", "", h2_out)
out <- data.frame(
Aihe = NA,
JaeID = as.character(1:length(out)),
Result = out
)
for (i in 1:length(h2)) {
filter2 <- pos < c(h2, Inf)[i + 1] & pos > h2[i]
out$Aihe[filter2] <- h2_out[i]
}
return(out)
}
objects.store(get_page_ident, get_meta_data, get_current_version, try_dl_rm, upload_with_autoid, jaehakukone, parse_page)
|
Kooditemplaatteja
#######################
# Add script template #
#######################
library(OpasnetUtils)
objects.latest("Op_fi4325", "apufunktiot")
ident.subset <- paste(page_ident, "subset", sep = ".")
version <- get_current_version("main_ident")
filter <- list()
filter$Versio <- version
dat <- data.frame(
Versio = version,
Aika = date(),
Käyttäjä = wiki_username,
Result = input
)
upload_with_autoid(
dat,
ident.subset,
id_name = "XID",
prefix = "",
filter = filter
)
Ohjeita
- Tägikone kuuluu osana laajempaan kokonaisuuteen Tuotosten jäsentäminen Opasnetissä. Katso tarkemmat ohjeet sieltä.