Initial commit
p# new file: runwikiq.sh
This commit is contained in:
172
lib-00-utils.R
Normal file
172
lib-00-utils.R
Normal file
@@ -0,0 +1,172 @@
|
||||
# Library containing helper functions
|
||||
# Copyright (C) 2018 Nathan TeBlunthuis
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
library(parallel)
|
||||
library(urltools)
|
||||
library(data.table)
|
||||
library(texreg)
|
||||
|
||||
## Load one wikiq TSV file and derive per-edit bookkeeping columns.
##
## This wikiq data doesn't have persistent word revisions and doesn't
## collapse user edits.
##
## Args:
##   path: path of the wikiq TSV file, handed to fread().
##   wiki.type: optional type of the wiki stored in `path`. When NULL (the
##     default) the first entry of the global `wiki.list$wiki.type` is used,
##     which is what a vector-valued `if` condition evaluated to before
##     R 4.2 turned it into an error.
##
## Returns:
##   A data.table of the edits in namespaces 0, 1, 3 and 4, ordered by
##   date.time, with the added columns reverted, new.account, total.edits,
##   total.sessions, total.editors and total.pages.
load.wikiq.file <- function(path, wiki.type = NULL) {
  d <- fread(path,
             colClasses = list(character = c("reverteds", "date_time",
                                             "editor", "title")),
             na.strings = "", stringsAsFactors = TRUE, quote = "",
             drop = c("sha1", "minor"))
  gc()

  ## wikiq column names use underscores; the rest of the code uses dots
  setnames(d, gsub('_', '.', colnames(d)))
  setkey(d, "revid")
  d$date.time <- as.POSIXct(as.character(d$date.time),
                            format = "%Y-%m-%d %H:%M:%S",
                            tz = "UTC")

  ## editor and title are URL-encoded in the input
  d[, ':='(editor = as.factor(url_decode(as.character(editor))),
           title = as.factor(url_decode(as.character(title))))]

  ## edits attributed to 127.0.0.1 are flagged as not anonymous
  d[d$editor == "127.0.0.1", "anon"] <- FALSE

  ## drop edits made before mediawiki was written
  ## (the argument is `tz`; a misspelled name is silently ignored and the
  ## cutoff would be interpreted in the local time zone)
  d <- d[d$date.time > as.POSIXct("2002-01-22", tz = "UTC"), ]

  ## drop wikia edits made after 2010-04-10, when data was collected
  if (is.null(wiki.type)) {
    wiki.type <- wiki.list$wiki.type
  }
  if (isTRUE(wiki.type[1] == "wikia")) {
    d <- d[d$date.time < as.POSIXct("2010-04-10", tz = "UTC"), ]
  }

  ## create "reverted", which captures whether an edit has been identity
  ## reverted within the revert RADIUS (currently 15 edits).
  if (!any(d$revert, na.rm = TRUE)) {
    d$reverted <- FALSE
    ## we need to reorder the columns in this case
    ## the merge in the other case also reorders columns
    setcolorder(d, c("revid", names(d)[!grepl("revid", names(d))]))
  } else {
    reverted.ids <- d$reverteds[d$revert]

    if (!any(grepl(",", d$reverteds))) {
      ## NOTE(review): this branch reads the reverteds of every row, not
      ## only of the rows flagged as reverts -- confirm that is intended.
      reverted.ids <- unique(as.integer(as.character(d$reverteds)))
    } else {
      ## a revert can undo several edits, listed comma-separated
      reverted.ids <- unique(as.integer(unlist(
        strsplit(as.character(reverted.ids), ","))))
    }

    reverted.dt <- data.table(revid = reverted.ids, reverted = TRUE)
    d <- merge(d, reverted.dt, by = "revid", all.x = TRUE)
    d$reverted[is.na(d$reverted)] <- FALSE
  }

  ## "new.account" indicates whether this is an editor's first edit
  setkey(d, "date.time")
  d$new.account <- !duplicated(d$editor)
  d$new.account[is.na(d$editor)] <- FALSE

  ## NOTE(review): total.edits is the constant row count while
  ## total.sessions is the running row index -- confirm the naming.
  d$total.edits <- length(d$revid)
  d$total.sessions <- seq_len(nrow(d))
  d$total.editors <- cumsum(d$new.account)
  d$total.pages <- cumsum(!duplicated(d$articleid))

  ## remove edits not in the namespaces we care about
  d <- d[namespace %in% c(0, 1, 3, 4), ]
  d
}
|
||||
|
||||
## Load the wikiq file of the i-th wiki in wiki.list and label its rows.
##
## Args:
##   i: row index into wiki.list.
##   wiki.list: data.table with (at least) the columns filename, wiki and
##     wiki.type.
##   path: directory prefix prepended to the wiki's filename.
##
## Returns:
##   The data.table produced by load.wikiq.file() with the extra columns
##   wiki.name, wiki.type and time.first.edit (earliest date.time per
##   editor.id within the wiki).
load.wikiq.files <- function(i,wiki.list, path="wikiq_wikia_2010_all_nopersistence/"){
  entry <- wiki.list[i]

  ## progress output: name of the wiki being loaded
  print(entry$wiki)

  d <- load.wikiq.file(paste0(path, entry$filename))

  n.rows <- nrow(d)
  d$wiki.name <- rep(entry$wiki, n.rows)
  d$wiki.type <- rep(entry$wiki.type, n.rows)

  d[, time.first.edit := min(date.time), by = .(editor.id, wiki.name)]

  d
}
|
||||
|
||||
## Store a value in the global list `r` and persist that list to disk.
##
## If no object named `r` exists yet, it is initialised from "remember.RDS"
## in the working directory when that file exists, and as an empty list
## otherwise. After the value is stored the whole list is written back to
## "remember.RDS".
##
## Args:
##   v: the value to store.
##   k: key under which to store it; defaults to the deparsed expression
##     that was passed as `v`.
##   silent: when FALSE (the default) the stored value is printed.
##
## Returns:
##   The stored value, invisibly.
remember <- function (v, k, silent=FALSE) {
  rfilename <- "remember.RDS"

  if (!exists("r")) {
    if (file.exists(rfilename)) {
      r <<- readRDS(rfilename)
    } else {
      r <<- list()
    }
  }

  if (missing(k)) {
    ## deparse() may split a long expression over several strings;
    ## collapse them so that the key is always a single string
    k <- paste(deparse(substitute(v)), collapse = "")
  }

  ## save to the global r variable/list
  r[[k]] <<- v

  if (!silent) {
    print(r[[k]])
    flush.console()
  }

  ## write first, so that the value (not saveRDS()'s NULL) is what the
  ## function returns
  saveRDS(r, rfilename)

  invisible(r[[k]])
}
|
||||
|
||||
## Script-level setup: default flags, the list of wikis to analyse, and
## the number of cores used by the parallel package.

## make sure that appendix, nosave and plot.distributions are always defined
if (!exists("appendix")) { appendix <- FALSE }
if (!exists("nosave")) { nosave <- FALSE }
## (the name tested must match the name assigned, otherwise a value set by
## the caller is always overwritten)
if (!exists("plot.distributions")) { plot.distributions <- FALSE }

basedir <- "."
setwd(basedir)
include.wikipedia <- FALSE

## build wiki.list unless the caller already provides one
if (!exists("wiki.list")) {
  subdir <- "userroles_data/"

  ## exists() takes a variable name, not a path: test for the variable
  ## that the block below creates
  if (!exists("missing.wikis")) {
    deleted.wikis <- fread(paste0(subdir, "allusers_deleted_merge.txt"),
                           header = FALSE, col.names = c("wiki"))
    deleted.wikis <- unique(deleted.wikis$wiki)

    notauthorized.wikis <- fread(paste0(subdir, "allusers_notauthorized_merge.txt"),
                                 header = FALSE, col.names = c("wiki"))
    notauthorized.wikis <- unique(notauthorized.wikis$wiki)

    missing.wikis <- c(deleted.wikis, notauthorized.wikis)
    remember(deleted.wikis)
    remember(notauthorized.wikis)
  }

  wiki.list <- fread("selected.wikis.csv")
  ## drop the wikis listed as deleted or not authorized
  wiki.list <- wiki.list[! (wiki %in% missing.wikis) ]
  wiki.list[wiki.type=="wikia",filename:=paste0(wiki,".tsv")]

  if (include.wikipedia) {
    ## derive the language code from the wikipedia url; it is used to
    ## build the wikipedia file names
    matchidx <- wiki.list[wiki.type=="wikipedia",regexec("https://(.*)\\.wikipedia.org",url)]
    lang <- sapply(regmatches(wiki.list[wiki.type=="wikipedia",url],matchidx),function (l) l[2])
    lang <- gsub("-","_",lang)
    wiki.list[wiki.type=="wikipedia",lang := lang]
    wiki.list[wiki.type=="wikipedia",filename:=paste0(lang,"_wikipedia.tsv")]
  } else {
    wiki.list <- wiki.list[wiki.type != "wikipedia"]
  }

  # wiki.list[,lang := NULL]

  rm(missing.wikis)
}

## record the wikis in use, once
if (!file.exists("wikis.used")) {
  write(wiki.list$wiki,"wikis.used")
}

options(mc.cores = 16)
|
||||
Reference in New Issue
Block a user