1
0
Files
rises_declines_wikia_code/lib-00-utils.R
groceryheist 72633c193b Initial commit
p#	new file:   runwikiq.sh
2018-06-02 15:32:19 -07:00

173 lines
5.7 KiB
R

# Library containing helper functions
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(parallel)
library(urltools)
library(data.table)
library(texreg)
## Load the wikiq data for a single wiki file.
## This wikiq data doesn't have persistent word revisions, and it doesn't
## collapse user edits either.
##
## Arguments:
##   path       path of the wikiq TSV file; handed to fread().
##   wiki.type  optional type of the wiki being loaded. "wikia" enables the
##              2010-04-10 cutoff. When NULL (the default) the type of the
##              first row of the global `wiki.list` is used, which is what the
##              former vector-valued `if` condition effectively tested.
##
## Returns a data.table keyed on date.time, restricted to namespaces 0, 1, 3
## and 4, with the extra columns reverted, new.account, total.edits,
## total.sessions, total.editors and total.pages. The total.* columns are
## computed before the namespace filter is applied.
load.wikiq.file <- function(path, wiki.type = NULL){
    d <- fread(path,
               colClasses=list(character=c("reverteds", "date_time", "editor", "title")),
               na.strings="", stringsAsFactors=TRUE, quote="",
               drop=c("sha1","minor"))
    gc()

    ## use dotted column names (date_time -> date.time, ...)
    setnames(d, gsub('_', '.', colnames(d)))
    setkey(d, "revid")

    d$date.time <- as.POSIXct(as.character(d$date.time),
                              format="%Y-%m-%d %H:%M:%S",
                              tz="UTC")

    ## editor names and titles are URL-encoded in the input
    d[, ':='(editor = as.factor(url_decode(as.character(editor))),
             title = as.factor(url_decode(as.character(title))))]

    d[d$editor == "127.0.0.1","anon"] <- FALSE

    ## drop edits made before mediawiki was written
    ## as.POSIXct() takes `tz`; an unknown `timezone=` argument is silently
    ## ignored, which made the cutoffs depend on the local time zone.
    d <- d[d$date.time > as.POSIXct("2002-01-22", tz="UTC"),]

    ## drop wikia edits made after 2010-04-10, when data was collected
    if (is.null(wiki.type)) {
        ## `if` needs a scalar condition (an error for longer vectors since
        ## R 4.2), so only the first row of wiki.list is consulted.
        wiki.type <- wiki.list$wiki.type[1]
    }
    if (isTRUE(wiki.type[1] == "wikia")) {
        d <- d[d$date.time < as.POSIXct("2010-04-10", tz="UTC"),]
    }

    ## create "reverted", which captures whether an edit has been identity
    ## reverted within the revert RADIUS (currently 15 edits).
    if (!any(d$revert, na.rm=TRUE)) {
        d$reverted <- FALSE
        ## we need to reorder the columns in this case
        ## the merge in the other case also reorders columns
        setcolorder(d, c("revid", setdiff(names(d), "revid")))
    } else {
        ## "reverteds" holds comma separated revision ids; strsplit() also
        ## handles entries without a comma, so one code path is enough.
        reverted.ids <- as.character(d$reverteds[d$revert])
        reverted.ids <- unique(as.integer(unlist(strsplit(reverted.ids, ","))))
        reverted.ids <- reverted.ids[!is.na(reverted.ids)]

        reverted.dt <- data.table(revid=reverted.ids, reverted=TRUE)
        d <- merge(d, reverted.dt, all.x=TRUE)
        d$reverted[is.na(d$reverted)] <- FALSE
    }

    ## "new.account" indicates whether this is the first edit by an editor
    setkey(d, "date.time")
    d$new.account <- !duplicated(d$editor)
    d$new.account[is.na(d$editor)] <- FALSE

    ## NOTE(review): total.edits is the constant row count while
    ## total.sessions is a running row index -- confirm this is intended.
    d$total.edits <- length(d$revid)
    ## seq_len() stays empty for an empty table (seq(1, 0) would be c(1, 0))
    d$total.sessions <- seq_len(nrow(d))
    d$total.editors <- cumsum(d$new.account)
    d$total.pages <- cumsum(!duplicated(d$articleid))

    ## remove edits not in the namespaces we care about
    d <- d[namespace %in% c(0,1,3,4),]
    return(d)
}
## Load the wikiq data of the i-th wiki listed in wiki.list.
##
## Arguments:
##   i          row of wiki.list to load.
##   wiki.list  data.table with (at least) the columns filename, wiki and
##              wiki.type.
##   path       directory prefix prepended to the file name.
##
## Returns the data.table produced by load.wikiq.file() with the columns
## wiki.name, wiki.type and time.first.edit (earliest date.time per
## editor.id and wiki.name) added.
load.wikiq.files <- function(i,wiki.list, path="wikiq_wikia_2010_all_nopersistence/"){
    tsv.name <- wiki.list[i, filename]
    wiki.label <- wiki.list[i, wiki]
    print(wiki.label)

    edits <- load.wikiq.file(paste0(path, tsv.name))

    ## tag every row with the wiki it came from
    n.edits <- nrow(edits)
    edits$wiki.name <- rep(wiki.label, n.edits)
    edits$wiki.type <- rep(wiki.list[i, wiki.type], n.edits)

    edits[, time.first.edit := min(date.time), by = .(editor.id, wiki.name)]
    edits
}
## Store a value in the global list `r` and persist that list to
## "remember.RDS" in the working directory.
##
## Arguments:
##   v       the value to store.
##   k       name to store it under; defaults to the deparsed expression
##           that was passed as `v`.
##   silent  when FALSE (the default) the stored value is printed.
##
## Returns the stored value, invisibly.
remember <- function (v, k, silent=FALSE) {
    rfilename <- "remember.RDS"

    ## lazily create the global list, restoring an earlier one from disk
    if (!exists("r")) {
        if (file.exists(rfilename)) {
            r <<- readRDS(rfilename)
        } else {
            r <<- list()
        }
    }

    if (missing(k)) {
        ## deparse() may split long expressions over several strings;
        ## collapse them so the key is always a single name
        k <- paste(deparse(substitute(v)), collapse="")
    }

    ## save to the global r variable/list
    r[[k]] <<- v

    if (!silent) {
        print(r[[k]])
        flush.console()
    }

    ## persist first so that the invisible value is what the function returns
    saveRDS(r, rfilename)
    invisible(r[[k]])
}
## make sure that appendix, nosave and plot.distributions are always defined;
## values that were set before this file is sourced are left untouched
if (!exists("appendix")) { appendix <- FALSE }
if (!exists("nosave")) { nosave <- FALSE }
## the guard used to test a misspelled name, so plot.distributions was
## always reset to FALSE
if (!exists("plot.distributions")) { plot.distributions <- FALSE }

basedir <- "."
setwd(basedir)

include.wikipedia <- FALSE

## build wiki.list (the wikis to analyze) unless it is already defined
if (!exists("wiki.list")) {
    subdir <- "userroles_data/"

    ## exists() looks up a variable name, not a path; the former
    ## paste0(subdir, "missing.wikis") lookup could never succeed
    if (!exists("missing.wikis")) {
        deleted.wikis <- fread(paste0(subdir,"allusers_deleted_merge.txt"),header=FALSE,col.names=c("wiki"))
        deleted.wikis <- unique(deleted.wikis$wiki)

        notauthorized.wikis <- fread(paste0(subdir,"allusers_notauthorized_merge.txt"),header=FALSE,col.names=c("wiki"))
        notauthorized.wikis <- unique(notauthorized.wikis$wiki)

        missing.wikis <- c(deleted.wikis, notauthorized.wikis)
        remember(deleted.wikis)
        remember(notauthorized.wikis)
    }

    wiki.list <- fread("selected.wikis.csv")
    ## drop the wikis listed as deleted or not authorized
    wiki.list <- wiki.list[! (wiki %in% missing.wikis) ]
    wiki.list[wiki.type=="wikia",filename:=paste0(wiki,".tsv")]

    if (include.wikipedia) {
        ## derive the language code from the url to build the file name
        matchidx <- wiki.list[wiki.type=="wikipedia",regexec("https://(.*)\\.wikipedia.org",url)]
        lang <- sapply(regmatches(wiki.list[wiki.type=="wikipedia",url],matchidx),function (l) l[2])
        lang <- gsub("-","_",lang)
        wiki.list[wiki.type=="wikipedia",lang := lang]
        wiki.list[wiki.type=="wikipedia",filename:=paste0(lang,"_wikipedia.tsv")]
    } else {
        wiki.list <- wiki.list[wiki.type != "wikipedia"]
    }

    # wiki.list[,lang := NULL]
    rm(missing.wikis)
}

## record the wikis that are used, unless that file already exists
if (!file.exists("wikis.used")) {
    write(wiki.list$wiki,"wikis.used")
}

options(mc.cores = 16)