Tägikone


== Helper functions ==

'''Requires the plyr library.'''
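
The rcode block below defines the tag machine's helper functions: reading page idents and version numbers from the Versiot subset, downloading subsets with removals applied (try_dl_rm), uploading rows with automatically generated running ids (upload_with_autoid), the verse search engine jaehakukone, and the page parser parse_page. The functions are stored with objects.store so that other pages can load them with objects.latest.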

<rcode label='Alusta funktiot' name='apufunktiot' embed=1 showcode=1>

library(OpasnetUtils)

library(plyr) # provides rbind.fill and join, used below

get_page_ident <- function(main_ident, page_name) {
	filter <- list()
	filter$Sivu <- page_name
	ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
	out <- as.character(unique(ver$Ident))
	return(out)
}

get_current_version <- function(main_ident) {
	ver <- opbase.data(main_ident, subset = "Versiot")
	out <- max(as.numeric(as.character(ver$Versio)))
	return(out)
}

get_meta_data <- function(main_ident, page_name = NULL) {
	filter <- list()
	filter$Sivu <- page_name
	ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
	return(ver)
}

# Download one or more opbase subsets (ident strings passed in ...), combine
# them, and drop rows whose rm_ind value appears in the removal subset rm_id.
# Include-filters are limited to the indices each subset actually has; failed
# downloads are treated as empty data.
try_dl_rm <- function(..., filter = NULL, rm_id = NULL, rm_ind = "JaeID") {
	out <- data.frame()
	first <- TRUE
	for (i in list(...)) {
		temp <- tryCatch(
			opbase.data(
				i, 
				include = filter[names(filter) %in% opbase.indices(i)]
			), 
			error = function(...) return(data.frame())
		)
		#if (nrow(temp) == 0) print(paste("Failed to download ", i, " ", geterrmessage(), sep = ""))
		if (nrow(temp) > 0) {
			if (first) {
				out <- rbind(out, temp)
				first <- FALSE
			} else {
				out <- rbind.fill(out, temp[colnames(temp) %in% colnames(out)])
			}
		}
	}
	if (!is.null(rm_id)) {
		#filter[[rm_ind]] <- NULL
		#if (length(filter) == 0) filter <- NULL
		temp <- tryCatch(
			opbase.data(
				rm_id, 
				include = filter[names(filter) %in% opbase.indices(rm_id)]
			), 
			error = function(...) return(NULL)
		)
		#if (is.null(temp)) print(paste("Failed to download ", rm_id, " ", geterrmessage(), sep = ""))
		if (!is.null(temp)) {
			colnames(temp)[colnames(temp) == "Result"] <- rm_ind
			#temp <- temp[colnames(temp)[colnames(temp) %in% colnames(out)]]
			# On second thought we should only get marginals into the merge... 
			temp <- temp[colnames(temp)[colnames(temp) %in% c(rm_ind, "Sivu", "Versio")]]
			temp$Remove_tagged <- 1
			out <- join(out, temp, match = "first") # left join from plyr, preserves original order, first match is faster
			out <- out[is.na(out$Remove_tagged),]
			out[["Remove_tagged"]] <- NULL
		}
	}
	return(out)
}

# Append rows to the subset given as "Ident.Subset", generating running ids
# (optionally prefixed) into the id_name column by scanning the existing data
# for the largest numeric id.
upload_with_autoid <- function(data, ident.subset, id_name, pagename, prefix = character(), filter = NULL) {
	# Bad implementation that downloads whole data just to find next id
	temp <- tryCatch(
			opbase.data(ident.subset, include = filter), 
			error = function(...) return(NULL)
	)
	if (!is.null(temp)) {
		obs <- gsub("[A-Za-z]", "", as.character(temp[[id_name]]))
		obs <- max(as.numeric(obs)) + 1
	} else {
		obs <- 1
	}
	if (nrow(data) > 1) obs <- obs:(obs + nrow(data) - 1)
	data[[id_name]] <- paste(prefix, obs, sep = "")
	ident.subset <- strsplit(ident.subset, ".", fixed = TRUE)[[1]]
	opbase.upload(
			data, 
			ident = ident.subset[1], 
			name = pagename, # needs rcode tag variables
			subset = ident.subset[2], 
			act_type = "append", 
			#language = "fin", 
			who = wiki_username
	)
	#return(data)
}

# Verse search engine: collect the verses (Jakeet, Jakeenlisaykset) of the
# given version and page, apply removals (Jakeenpoistot), and append the
# comments of each page listed in the Versiot subset, with their removals
# (Kommentinpoistot) applied as well.
jaehakukone <- function(main_ident, sivu = NULL, versio = NULL, jae = NULL) {
	filter <- list()
	if (is.null(versio)) versio <- get_current_version(main_ident)
	filter$Versio <- versio
	filter$Sivu <- sivu
	filter$JaeID <- jae
	
	out <- try_dl_rm(
		paste(main_ident, "Jakeet", sep = "."), 
		paste(main_ident, "Jakeenlisaykset", sep = "."),
		filter = filter,
		rm_id = paste(main_ident, "Jakeenpoistot", sep = "."),
		rm_ind = "JaeID"
	)
	
	meta_data <- get_meta_data(main_ident, sivu)
	ids <- as.character(unique(meta_data$Ident))
	names <- as.character(unique(meta_data$Sivu))
	
	# If NULL matches all pages listed in Versiot
	if (length(ids) > 0) {
		for (i in 1:length(ids)) {
			temp <- try_dl_rm(
				paste(ids[i], "Kommentit", sep = "."), 
				filter = filter,
				rm_id = paste(ids[i], "Kommentinpoistot", sep = "."),
				rm_ind = "JaeID"
			)
			if (nrow(temp) > 0) {
				temp$Sivu <- names[i]
				if (nrow(out) > 0) {
					# cn instead of i: do not shadow the outer loop variable
					for (cn in colnames(out)[!colnames(out) %in% colnames(temp)]) {
						temp[[cn]] <- NA
					}
					out <- rbind(out, temp[colnames(out)])
				} else {
					out <- temp
				}
			}
		}
	}
	
	return(out)
}

# Fetch a rendered wiki page and split it into verses: paragraphs, table
# cells, list items and definitions. With res = "sentence" the verses are
# further split at sentence boundaries. Each verse gets a running JaeID and
# is tagged with its enclosing level-2 header (Aihe).
parse_page <- function(url, url_args = "", nchar_threshold = 11, res = "par") {
	turl <- paste(url, url_args, "&action=render", sep = "")
	a <- opasnet.page(turl, wiki = "opasnet_fi")
	
	par <- gregexpr("<p>(.*?)</p>", a)[[1]]
	par_out <- substr(rep(a, length(par)), par, par + attributes(par)$match.length)
	par_out <- gsub("<p>", "", par_out)
	par_out <- gsub("</p>", "", par_out)
	
	table <- gregexpr("<table[^>]*>(.*?)</table>", a)[[1]]
	td <- gregexpr("<td[^>]*>(.*?)</td>", a)[[1]]
	#th <- gregexpr("<th[^>]*>(.*?)</th>", a)[[1]]
	
	table_out <- substr(rep(a, length(td)), td, td + attributes(td)$match.length)
	table_out <- gsub("<td[^>]*>", "", table_out)
	table_out <- gsub("</td>", "", table_out)
	
	# Use paragraphs in table cells to break content (glitchy?)
	# At least it removes duplicates of <p> (due to matching <p> search as well as <td>)
	tdppos <- gregexpr("<p>(.*?)</p>", table_out)
	table_out <- gsub("<p>(.*?)</p>", "<super_separator>", table_out)
	tdp <- strsplit(table_out, "<super_separator>")
	tdpl <- sapply(tdp, length)
	temp <- list()
	# Find begin positions of <p> separated bits in <td>
	for (i in (1:length(tdp))[tdpl > 1]) {
		temp[[i]] <- c(0, tdppos[[i]] + attributes(tdppos[[i]])$match.length) + td[i]
	}
	# Filter <td> with <p>
	filter <- logical()
	for (i in 1:length(tdp)) {
		if (tdpl[i] == 1) val <- FALSE else val <- TRUE
		filter <- c(filter, rep(val, tdpl[i]))
	}
	tdp <- unlist(tdp)[filter]
	filter <- -(1:length(tdpl))[tdpl > 1]
	# Filter out
	table_out <- table_out[filter]
	td <- td[filter]
	# Add new bits and positions to end of filtered original
	table_out <- c(table_out, tdp)
	td <- c(td, unlist(temp))
	
	li <- gregexpr("<li>(.*?)</li>", a)[[1]]
	li_out <- substr(rep(a, length(li)), li, li + attributes(li)$match.length)
	li_out <- gsub("<li>", "", li_out)
	li_out <- gsub("</li>", "", li_out)
	
	dd <- gregexpr("<dd>(.*?)</dd>", a)[[1]]
	dd_out <- substr(rep(a, length(dd)), dd, dd + attributes(dd)$match.length)
	dd_out <- gsub("<dd>", "", dd_out)
	dd_out <- gsub("</dd>", "", dd_out)
	
	out <- c(par_out, table_out, li_out, dd_out)
	pos <- order(c(par, td, li, dd))
	out <- out[pos]
	pos <- c(par, td, li, dd)[pos]
	
	filter <- nchar(out) > nchar_threshold
	out <- out[filter]
	pos <- pos[filter]
	
	# Resolution beyond paragraph
	if (res == "sentence") {
		per <- gregexpr("\\.|\\?|!", out)
		per_out <- list()
		for (i in 1:length(out)) {
			per_out[[i]] <- substr(
				rep(out[i], length(per[[i]]) + 1), 
				c(0, per[[i]]) + 1, 
				c(per[[i]], nchar(out[i]))
			)
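			# Merge pieces with fewer than 4 non-space characters back into the
			# previous piece and rejoin the following one, so that periods inside
			# e.g. decimal numbers or abbreviations do not split a sentence.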
			j <- 2
			while (j < length(per_out[[i]]) + 1) {
				if (nchar(gsub("\\s", "", per_out[[i]][j])) < 4) {
					per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
					per_out[[i]] <- per_out[[i]][-j]
					if (j - 1 < length(per_out[[i]])) {
						per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
						per_out[[i]] <- per_out[[i]][-j]
					}
				} else {
					j <- j + 1
				}
			}
			#j <- length(per_out[[i]])
			#while (j > 0) {
			#	if (nchar(gsub("\\s", "", per_out[[i]][j])) < nchar_threshold) {
			#		per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
			#		per_out[[i]] <- per_out[[i]][-j]
			#	} else {
			#		j <- j - 1
			#	}
			#}
		}
		out <- unlist(per_out)
		pos <- rep(pos, sapply(per_out, length))
		
		filter <- nchar(out) > nchar_threshold
		out <- out[filter]
		pos <- pos[filter]
		
		out <- gsub("^\\s*|\\s*$", "", out)
		out <- gsub("\\n", " ", out)
		
		# Advanced natural language processing is not available for the Finnish language
		#require(NLP)
		#require(openNLP)
		#s <- as.String(s)
		#sent_token_annotator <- Maxent_Sent_Token_Annotator("fi")
		#sent_token_annotator
		#a1 <- annotate(s, sent_token_annotator)
		#a1
		## Extract sentences.
		#s[a1]
	}
	
	
	# Level 2 header = Topic
	h2 <- gregexpr("<h2>(.*?)</h2>", a)[[1]]
	h2_out <- substr(rep(a, length(h2)), h2, h2 + attributes(h2)$match.length)
	h2_out <- gsub("<[^>]*>", "", h2_out)
	
	out <- data.frame(
		Aihe = NA, 
		JaeID = as.character(1:length(out)), 
		Result = out
	)
	
	for (i in 1:length(h2)) {
		filter2 <- pos < c(h2, Inf)[i + 1] & pos > h2[i]
		out$Aihe[filter2] <- h2_out[i]
	}
	
	return(out)
}

objects.store(get_page_ident, get_meta_data, get_current_version, try_dl_rm, upload_with_autoid, jaehakukone, parse_page)
</rcode>
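
A minimal usage sketch (not part of the stored code): the helpers are loaded on another page with objects.latest, as in the template below. The ident "Op_fi4325" comes from that template; the page name "Esimerkkisivu" and the page address are hypothetical.

<pre>
library(OpasnetUtils)
library(plyr)
objects.latest("Op_fi4325", "apufunktiot") # load the stored helper functions

# Latest version number recorded in the Versiot subset
versio <- get_current_version("Op_fi4325")

# All verses and comments of one page in that version ("Esimerkkisivu" is hypothetical)
jakeet <- jaehakukone("Op_fi4325", sivu = "Esimerkkisivu", versio = versio)

# Split a rendered page into sentence-level verses; the address must already
# contain "?" because parse_page appends "&action=render" (hypothetical URL)
jakeet2 <- parse_page("http://fi.opasnet.org/fi-wiki/index.php?title=Esimerkkisivu",
	res = "sentence")
</pre>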

== Code templates ==

<pre>

#######################
# Add script template #
#######################

library(OpasnetUtils)
objects.latest("Op_fi4325", "apufunktiot")

ident.subset <- paste(page_ident, "subset", sep = ".") # page_ident and "subset" are placeholders

version <- get_current_version("main_ident") # "main_ident" is a placeholder ident

filter <- list()
filter$Versio <- version

dat <- data.frame(
	Versio = version, 
	Aika = date(), 
	Käyttäjä = wiki_username, 
	Result = input
)

upload_with_autoid(
	dat,
	ident.subset,
	id_name = "XID",
	pagename = "Esimerkkisivu", # required by upload_with_autoid; placeholder page name
	prefix = "",
	filter = filter
)
</pre>
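
In the template, page_ident, "main_ident", the subset name, "Esimerkkisivu" and input are placeholders to be filled in for the page in question; wiki_username comes from the rcode environment. Each run appends the input as new rows whose running ids upload_with_autoid generates into the XID column.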

== Instructions ==

Tägikone is part of the larger whole [[Tuotosten jäsentäminen Opasnetissä]]. See that page for more detailed instructions.