Tägikone

Opasnet Suomista
Versio hetkellä 9. joulukuuta 2014 kello 14.58 – tehnyt Teemu R (keskustelu | muokkaukset) (→‎Apufunktiot: jakeistajan resoluutio lausetasolle uudella parametrilla)
(ero) ← Vanhempi versio | Nykyinen versio (ero) | Uudempi versio → (ero)
Siirry navigaatioon Siirry hakuun




Apufunktiot

Vaati plyr kirjaston

- Piilota koodi

library(OpasnetUtils)

# Requires plyr

get_page_ident <- function(main_ident, page_name) {
	filter <- list()
	filter$Sivu <- page_name
	ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
	out <- as.character(unique(ver$Ident))
	return(out)
}

get_current_version <- function(main_ident) {
	ver <- opbase.data(main_ident, subset = "Versiot")
	out <- max(as.numeric(as.character(ver$Versio)))
	return(out)
}

get_meta_data <- function(main_ident, page_name = NULL) {
	filter <- list()
	filter$Sivu <- page_name
	ver <- opbase.data(main_ident, subset = "Versiot", include = filter)
	return(ver)
}

try_dl_rm <- function(..., filter = NULL, rm_id = NULL, rm_ind = "JaeID") {
	out <- data.frame()
	first <- TRUE
	for (i in list(...)) {
		temp <- tryCatch(
			opbase.data(
				i, 
				include = filter[names(filter) %in% opbase.indices(i)]
			), 
			error = function(...) return(data.frame())
		)
		#if (nrow(temp) == 0) print(paste("Failed to download ", i, " ", geterrmessage(), sep = ""))
		if (nrow(temp) > 0) {
			if (first) {
				out <- rbind(out, temp)
				first <- FALSE
			} else {
				out <- rbind.fill(out, temp[colnames(temp) %in% colnames(out)])
			}
		}
	}
	if (!is.null(rm_id)) {
		#filter[[rm_ind]] <- NULL
		#if (length(filter) == 0) filter <- NULL
		temp <- tryCatch(
			opbase.data(
				rm_id, 
				include = filter[names(filter) %in% opbase.indices(rm_id)]
			), 
			error = function(...) return(NULL)
		)
		#if (is.null(temp)) print(paste("Failed to download ", rm_id, " ", geterrmessage(), sep = ""))
		if (!is.null(temp)) {
			colnames(temp)[colnames(temp) == "Result"] <- rm_ind
			#temp <- temp[colnames(temp)[colnames(temp) %in% colnames(out)]]
			# On second thought we should only get marginals into the merge... 
			temp <- temp[colnames(temp)[colnames(temp) %in% c(rm_ind, "Sivu", "Versio")]]
			temp$Remove_tagged <- 1
			out <- join(out, temp, match = "first") # left join from plyr, preserves original order, first match is faster
			out <- out[is.na(out$Remove_tagged),]
			out[["Remove_tagged"]] <- NULL
		}
	}
	return(out)
}

upload_with_autoid <- function(data, ident.subset, id_name, pagename, prefix = character(), filter = NULL) {
	# Bad implementation that downloads whole data just to find next id
	temp <- tryCatch(
			opbase.data(ident.subset, include = filter), 
			error = function(...) return(NULL)
	)
	if (!is.null(temp)) {
		obs <- gsub("[A-Za-z]", "", as.character(temp[[id_name]]))
		obs <- max(as.numeric(obs)) + 1
	} else {
		obs <- 1
	}
	if (nrow(data) > 1) obs <- obs:(obs + nrow(data) - 1)
	data[[id_name]] <- paste(prefix, obs, sep = "")
	ident.subset <- strsplit(ident.subset, ".", fixed = TRUE)[[1]]
	opbase.upload(
			data, 
			ident = ident.subset[1], 
			name = pagename, # needs rcode tag variables
			subset = ident.subset[2], 
			act_type = "append", 
			#language = "fin", 
			who = wiki_username
	)
	#return(data)
}

jaehakukone <- function(main_ident, sivu = NULL, versio = NULL, jae = NULL) {
	filter <- list()
	if (is.null(versio)) versio <- get_current_version(main_ident)
	filter$Versio <- versio
	filter$Sivu <- sivu
	filter$JaeID <- jae
	
	out <- try_dl_rm(
		paste(main_ident, "Jakeet", sep = "."), 
		paste(main_ident, "Jakeenlisaykset", sep = "."),
		filter = filter,
		rm_id = paste(main_ident, "Jakeenpoistot", sep = "."),
		rm_ind = "JaeID"
	)
	
	meta_data <- get_meta_data(main_ident, sivu)
	ids <- as.character(unique(meta_data$Ident))
	names <- as.character(unique(meta_data$Sivu))
	
	# If NULL matches all pages listed in Versiot
	if (length(ids) > 0) {
		for (i in 1:length(ids)) {
			temp <- try_dl_rm(
				paste(ids[i], "Kommentit", sep = "."), 
				filter = filter,
				rm_id = paste(ids[i], "Kommentinpoistot", sep = "."),
				rm_ind = "JaeID"
			)
			if (nrow(temp) > 0)	{
				temp$Sivu <- names[i]
				if(nrow(out) > 0) {
					for (i in colnames(out)[!colnames(out) %in% colnames(temp)]) {
						temp[[i]] <- NA
					}
					out <- rbind(out, temp[colnames(out)])
				} else {
					out <- temp
				}
			}
		}
	}
	
	return(out)
}

parse_page <- function(url, url_args = "", nchar_threshold = 11, res = "par") {
	turl <- paste(url, url_args, "&action=render", sep = "")
	a <- opasnet.page(turl, wiki = "opasnet_fi")
	
	par <- gregexpr("<p>(.*?)</p>", a)[[1]]
	par_out <- substr(rep(a, length(par)), par, par + attributes(par)$match.length)
	par_out <- gsub("<p>", "", par_out)
	par_out <- gsub("</p>", "", par_out)
	
	table <- gregexpr("<table[^>]*>(.*?)</table>", a)[[1]]
	td <- gregexpr("<td[^>]*>(.*?)</td>", a)[[1]]
	#th <- gregexpr("<th[^>]*>(.*?)</th>", a)[[1]]
	
	table_out <- substr(rep(a, length(td)), td, td + attributes(td)$match.length)
	table_out <- gsub("<td[^>]*>", "", table_out)
	table_out <- gsub("</td>", "", table_out)
	
	# Use paragraphs in table cells to break content (glitchy?)
	# At least it removes duplicates of <p> (due to matching <p> search as well as <td>)
	tdppos <- gregexpr("<p>(.*?)</p>", table_out)
	table_out <- gsub("<p>(.*?)</p>", "<super_separator>", table_out)
	tdp <- strsplit(table_out, "<super_separator>")
	tdpl <- sapply(tdp, length)
	temp <- list()
	# Find begin positions of <p> separated bits in <td>
	for (i in (1:length(tdp))[tdpl > 1]) {
		temp[[i]] <- c(0, tdppos[[i]] + attributes(tdppos[[i]])$match.length) + td[i]
	}
	# Filter <td> with <p>
	filter <- logical()
	for (i in 1:length(tdp)) {
		if (tdpl[i] == 1) val <- FALSE else val <- TRUE
		filter <- c(filter, rep(val, tdpl[i]))
	}
	tdp <- unlist(tdp)[filter]
	filter <- -(1:length(tdpl))[tdpl > 1]
	# Filter out
	table_out <- table_out[filter]
	td <- td[filter]
	# Add new bits and positions to end of filtered original
	table_out <- c(table_out, tdp)
	td <- c(td, unlist(temp))
	
	li <- gregexpr("<li>(.*?)</li>", a)[[1]]
	li_out <- substr(rep(a, length(li)), li, li + attributes(li)$match.length)
	li_out <- gsub("<li>", "", li_out)
	li_out <- gsub("</li>", "", li_out)
	
	dd <- gregexpr("<dd>(.*?)</dd>", a)[[1]]
	dd_out <- substr(rep(a, length(dd)), dd, dd + attributes(dd)$match.length)
	dd_out <- gsub("<dd>", "", dd_out)
	dd_out <- gsub("</dd>", "", dd_out)
	
	out <- c(par_out, table_out, li_out, dd_out)
	pos <- order(c(par, td, li, dd))
	out <- out[pos]
	pos <- c(par, td, li, dd)[pos]
	
	filter <- nchar(out) > nchar_threshold
	out <- out[filter]
	pos <- pos[filter]
	
	# Resolution beyond paragraph
	if (res == "sentence") {
		per <- gregexpr("\\.|\\?|!", out)
		per_out <- list()
		for (i in 1:length(out)) {
			per_out[[i]] <- substr(
				rep(out[i], length(per[[i]]) + 1), 
				c(0, per[[i]]) + 1, 
				c(per[[i]], nchar(out[i]))
			)
			j <- 2
			while (j < length(per_out[[i]]) + 1) {
				if (nchar(gsub("\\s", "", per_out[[i]][j])) < 4) {
					per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
					per_out[[i]] <- per_out[[i]][-j]
					if (j - 1 < length(per_out[[i]])) {
						per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
						per_out[[i]] <- per_out[[i]][-j]
					}
				} else {
					j <- j + 1
				}
			}
			#j <- length(per_out[[i]])
			#while (j > 0) {
			#	if (nchar(gsub("\\s", "", per_out[[i]][j])) < nchar_threshold) {
			#		per_out[[i]][j - 1] <- paste(per_out[[i]][j - 1], per_out[[i]][j], sep = "")
			#		per_out[[i]] <- per_out[[i]][-j]
			#	} else {
			#		j <- j - 1
			#	}
			#}
		}
		out <- unlist(per_out)
		pos <- rep(pos, sapply(per_out, length))
		
		filter <- nchar(out) > nchar_threshold
		out <- out[filter]
		pos <- pos[filter]
		
		out <- gsub("^\\s*|\\s*$", "", out)
		out <- gsub("\\n", " ", out)
		
		# Advanced natural language processing not available for finnish language
		#require(NLP)
		#require(openNLP)
		#s <- as.String(s)
		#sent_token_annotator <- Maxent_Sent_Token_Annotator("fi")
		#sent_token_annotator
		#a1 <- annotate(s, sent_token_annotator)
		#a1
		## Extract sentences.
		#s[a1]
	}
	
	
	# Level 2 header = Topic
	h2 <- gregexpr("<h2>(.*?)</h2>", a)[[1]]
	h2_out <- substr(rep(a, length(par)), h2, h2 + attributes(h2)$match.length)
	h2_out <- gsub("<[^>]*>", "", h2_out)
	
	out <- data.frame(
		Aihe = NA, 
		JaeID = as.character(1:length(out)), 
		Result = out
	)
	
	for (i in 1:length(h2)) {
		filter2 <- pos < c(h2, Inf)[i + 1] & pos > h2[i]
		out$Aihe[filter2] <- h2_out[i]
	}
	
	return(out)
}

objects.store(get_page_ident, get_meta_data, get_current_version, try_dl_rm, upload_with_autoid, jaehakukone, parse_page)

Kooditemplaatteja

#######################
# Add script template #
#######################

library(OpasnetUtils)
objects.latest("Op_fi4325", "apufunktiot")

ident.subset <- paste(page_ident, "subset", sep = ".")

version <- get_current_version("main_ident")

filter <- list()
filter$Versio <- version

dat <- data.frame(
	Versio = version, 
	Aika = date(), 
	Käyttäjä = wiki_username, 
	Result = input
)

upload_with_autoid(
	dat,
	ident.subset,
	id_name = "XID",
	prefix = "",
	filter = filter
)

Ohjeita

Tägikone kuuluu osana laajempaan kokonaisuuteen Tuotosten jäsentäminen Opasnetissä. Katso tarkemmat ohjeet sieltä.