Initial commit
p# new file: runwikiq.sh
This commit is contained in:
86
RCommunityData/R/wikiq.R
Normal file
86
RCommunityData/R/wikiq.R
Normal file
@@ -0,0 +1,86 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
# loads simple utility functions for use in the subsequent files
|
||||
|
||||
# wikiq header fields (presumably the column names of wikiq TSV
# output), kept in one place for re-use across various scripts
wikiq.header <- c(
    "title", "articleid", "revid", "timestamp",
    "anon", "editor", "editor_id", "minor",
    "text_size", "text_entropy", "text_md5",
    "reversion", "additions_size", "deletions_size",
    "edits", "articles", "users"
)
|
||||
|
||||
# helper function to load the TSV files our perl scripts are generating
#
# filename: passed straight to read.delim(); the first line of the file
#           is taken as the column names
#
# Quoting is disabled (quote=""), empty fields are read as NA
# (na.strings="") and character columns are converted to factors.
# Returns the data.frame produced by read.delim().
load.extracted.df <- function (filename) {
    # spell out TRUE: T is an ordinary variable that can be reassigned,
    # which would silently turn header parsing off
    read.delim(filename, header=TRUE, quote="", na.strings="",
               stringsAsFactors=TRUE)
}
|
||||
|
||||
# helper function to grab the classes of all columns of a dataframe
# keep this because it's being used but this can just be lapply(d, class)
#
# d: a data.frame
# returns: whatever sapply() builds from the per-column class() results,
#          named after the columns -- a character vector when every
#          column has a single class, a list otherwise
get.col.classes <- function (d) {
    column.names <- colnames(d)
    class.of.column <- function (column.name) {
        class(d[, column.name])
    }
    sapply(column.names, class.of.column)
}
|
||||
|
||||
# convert mediawiki timestamps into POSIXct
#
# ts.string: timestamps such as "2010-01-02T03:04:05Z"
# returns: POSIXct values in UTC; every "T" becomes a space and every
#          "Z" is removed before parsing with "%Y-%m-%d %H:%M:%S", and
#          strings that do not fit that format come back as NA
timestamp.to.POSIXct <- function (ts.string) {
    without.t <- gsub("T", " ", ts.string)
    cleaned <- gsub("Z", "", without.t)
    as.POSIXct(cleaned, format="%Y-%m-%d %H:%M:%S", tz="UTC")
}
|
||||
|
||||
|
||||
# read wikiq output (tab-separated, no quoting) into a data.frame
#
# con:            file name or connection, handed to read.delim()
# header:         handed to read.delim(); TRUE when the first line holds
#                 the column names
# detect.reverts: when TRUE, additionally flag the edits that were
#                 reverted and record who reverted them (see below)
#
# The returned data.frame has
#   - "date.time" renamed to "timestamp" and the first "_" of every
#     column name replaced by "."
#   - timestamp parsed into POSIXct (UTC)
#   - reversion turned into a logical (TRUE where a value was present)
#   - ipaddress: TRUE where the editor field was empty; for those rows
#     editor is filled in from editor.id
# and, with detect.reverts=TRUE, its rows sorted by title and timestamp,
# plus
#   - reverted: TRUE when the following row is a reversion and its md5
#     equals the md5 of the row before this one
#   - reverted.by: editor of the following row for reverted edits,
#     NA otherwise
#   Rows whose reverted status is NA (always the last row) are dropped.
read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
    d <- read.delim(con, stringsAsFactors=FALSE, header=header,
                    encoding="UTF-8", quote="")

    # rename date.time to timestamp and replace the first _ of each
    # column name with a dot
    colnames(d)[colnames(d) == "date.time"] <- "timestamp"
    colnames(d) <- sub("_", ".", colnames(d))

    # Timestamps in the mediawiki form "YYYY-MM-DDTHH:MM:SSZ" are turned
    # into "YYYY-MM-DD HH:MM:SS" before parsing; anything else is handed
    # to as.POSIXct() untouched. (The pattern used here before contained
    # a raw \xc8 byte and could not match a timestamp.)
    d$timestamp <- as.POSIXct(sub("^([0-9-]+)T([0-9:.]+)Z?$", "\\1 \\2",
                                  d$timestamp), tz="UTC")

    # convert reversion to a logical
    d$reversion <- !is.na(d$reversion)

    if (detect.reverts) {
        # reorder so we can now find the order and timestamp
        d <- d[order(d$title, d$timestamp),]
        n <- nrow(d)

        # shift a column by one row, padding with NA; unlike
        # x[2:length(x)] these also behave with fewer than two rows
        shift.up <- function (x) c(x[-1], NA)[seq_len(n)]
        shift.down <- function (x) c(NA, x[-n])[seq_len(n)]

        # an edit counts as reverted when the next edit is a reversion
        # that restores the md5 of the edit before this one
        # NOTE(review): neighbours are taken across title boundaries as
        # well -- confirm that this is acceptable
        md5.next <- shift.up(d$text.md5)
        md5.prev <- shift.down(d$text.md5)
        d$reverted <- shift.up(d$reversion) & (md5.next == md5.prev)

        # editor of the following row; taken before any row is dropped
        # so that the second-to-last edit keeps its reverter
        d$reverted.by <- shift.up(d$editor)

        # drop the edits whose reverted status is unknown (the last edit)
        d <- d[!is.na(d$reverted),]

        # NA out reverted.by for the nonreverts
        d$reverted.by[!d$reverted] <- NA
    }

    # set ip address to the username and create a new variable
    anon <- d$editor == ""
    d$ipaddress <- anon
    # which() skips NA editors, which would otherwise make the
    # subscripted assignment fail
    anon.rows <- which(anon)
    d$editor[anon.rows] <- d$editor.id[anon.rows]

    return(d)
}
|
||||
|
||||
# TODO refactor this so that we clean the data BEFORE we read it into R
# ATM, lines are only kept when they have exactly n.fields
# tab-separated fields (14 unless told otherwise)
#
# see the vereins wiki for "Philcomputing" and 29 lines that seem to
# have a newline in the editor name
#
# filename:       path of the bzip2-compressed wikiq file
# header:         passed on to read.wikiq()
# detect.reverts: passed on to read.wikiq()
# n.fields:       number of tab-separated fields a line must have to be
#                 kept; all other lines are discarded by awk
#
# Runs bzcat and awk through a shell pipe, so both have to be
# available. Returns the data.frame built by read.wikiq().
read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE,
                           n.fields=14) {
    n.fields <- as.integer(n.fields)
    if (length(n.fields) != 1 || is.na(n.fields) || n.fields < 1) {
        stop("n.fields must be a single positive integer")
    }

    # quote the (tilde-expanded) path so that spaces or shell
    # metacharacters in the file name cannot break or alter the command
    cmd <- paste("bzcat", shQuote(path.expand(filename)),
                 sprintf("|awk -F'\t' '{if (NF == %d) print;}'", n.fields))

    # the pipe is handed over unopened; read.delim() inside read.wikiq()
    # opens it and closes it again once the data has been read
    con <- pipe(cmd)
    read.wikiq(con, header=header, detect.reverts=detect.reverts)
}
|
||||
|
||||
Reference in New Issue
Block a user