rm(list=ls())

#####################
# The purpose of this file is to load up the datasets and clean them for processing.
#####################

#### set globals and make helpers
#basePath <- '/home/kaylea/Research/taboo/'
basePath <- '/gscratch/comdata/users/kaylea/taboo/'
dataPath <- paste0(basePath, 'data/')
rawPath  <- paste0(basePath, 'raw_data/')
botsFile <- paste0(rawPath, 'botList.tsv')

## observation window; wrapped in as.POSIXct() so the timestamps behave as atomic
## data.frame columns and work with pmax() below (strptime() alone returns POSIXlt)
endOfRecords   <- as.POSIXct(strptime('2022-06-02 20:15:46', "%Y-%m-%d %H:%M:%S")) # derived from end of the action logs
startOfRecords <- as.POSIXct(strptime('2008-09-20 05:23:14', "%Y-%m-%d %H:%M:%S"))

library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)

# recipe adapted from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
    file_list <- list.files(path)
    print(file_list)
    dataset <- NULL
    for (my_file in file_list) {
        if (my_file == '_SUCCESS') { # Spark metadata file, ignore
            next
        }
        print(paste0('Now Reading: ', path, my_file))
        temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE,
                                   sep="\t", stringsAsFactors=FALSE)
        if (is.null(dataset)) {
            dataset <- temp_dataset # the first file starts the merged dataset
        } else {
            dataset <- rbind(dataset, temp_dataset) # later files are appended to it
        }
        rm(temp_dataset)
    }
    dataset <- unique(dataset) # drop any duplicated rows
    return(dataset)
}
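
## For reference, a more compact reader sketched with data.table's rbindlist();
## it assumes the same layout (tab-separated files plus a Spark '_SUCCESS' marker)
## and returns a data.table rather than a data.frame. Not called below; the
## pipeline uses readPileToDF() above. The name readPileToDF.fast is hypothetical.
readPileToDF.fast <- function(path) {
    files <- setdiff(list.files(path), '_SUCCESS')
    piles <- lapply(files, function(f) read.table(paste0(path, f), quote="\"",
                    header=TRUE, sep="\t", stringsAsFactors=FALSE))
    unique(rbindlist(piles)) # rbindlist() stacks the per-file frames in one pass
}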

# Part 1 - load and clean revisions data

revDF <- readPileToDF(paste0(rawPath, 'revDataPlusUPL/')) # path prefix assumed to be rawPath
revDF$source <- "taboo"
#revDF$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)

## drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL

colnames(revDF)

revDF <- revDF[!is.na(revDF$revid),] # drop any rows where revid is NA

# userDF (per-editor covariates) is assumed to have been loaded earlier in the pipeline
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) # left (outer) join: all of revDF, plus any matches in userDF

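## Toy illustration of the left join above, with made-up values: merge(..., all.x=TRUE)
## keeps every revision even when the editor has no userDF row, filling NA instead.
toyLeft  <- data.frame(editor = c('a', 'b'))
toyRight <- data.frame(editor = 'a', groups = 'sysop') # 'groups' is a made-up column
print(merge(toyLeft, toyRight, by = 'editor', all.x = TRUE)) # 'b' survives, with groups = NA
rm(toyLeft, toyRight) # clean up so the toys don't land in save.image()
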
## eliminate any articles in both:

### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) # strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) # make sure both join keys are character
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
## update join: initialize isBot to FALSE, then flip it to TRUE for every row whose
## editor_id matches a row of botDF ('.()' is data.table shorthand for list())
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)]
isABot.tab <- table(revDF$isBot)
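
## Toy illustration of the update-join idiom above: rows whose key appears in the
## lookup table get flagged TRUE, everything else keeps the FALSE default.
toyDT   <- data.table(editor_id = c('1', '2', '3'))
toyBots <- data.table(editor_id = '2')
toyDT <- toyDT[, isBot := FALSE][toyBots, isBot := TRUE, on = .(editor_id)]
print(toyDT) # only editor_id '2' is flagged
rm(toyDT, toyBots) # clean up so the toys don't land in save.image()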

## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot == FALSE)
revDF <- NULL # so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon)) # anon is 'true'/'false'; logged-in editors are the non-anonymous ones

##### filtering done, now to do some summing-up

### Weighting
## For each article, the weight of each of its revisions is (N_total_revs / N_total_articles) / N_revs_for_that_article.
## This meets two criteria:
##   sum(weights over all revisions) = total number of revisions
##   sum(weights for any given article) = sum(weights for any other article)

numEdits   <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits = length(revid))            ## article-wise revision count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors = length(unique(editor))) ## article-wise editor count, including IP addresses
n.revs <- length(revDF.clean$revid)     ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs / n.arts) / revDF.clean$numEdits
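
## Cheap sanity check of the two weighting criteria (assuming no rows were lost in
## the merges above): total weight equals the revision count, and every article
## carries the same total weight, n.revs / n.arts.
stopifnot(isTRUE(all.equal(sum(revDF.clean$weight), n.revs)))
perArticleWeight <- tapply(revDF.clean$weight, revDF.clean$encodedTitle, sum)
stopifnot(isTRUE(all.equal(max(perArticleWeight), min(perArticleWeight))))
rm(perArticleWeight)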

revDF.clean$ngramWeight <- revDF.clean$count # rename: 'count' wasn't very descriptive
revDF.clean <- revDF.clean %>% mutate(got_reverted =
    case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE)) # i.e., got_reverted is !is.na(reverted_by)

table(revDF.clean$anon)
## keep only rows where anon parsed cleanly; the small number of NAs (187) look like parse problems
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon == 'true'),
                     subset(revDF.clean, revDF.clean$anon == 'false'))
table(revDF.clean$anon)

##### dropping items with missing revids; if this happens, find out why
### revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]

artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
    revid        = length(revid),      # per-article revision count
    got_reverted = sum(got_reverted),  # per-article count of reverted revisions
    date_time    = min(date_time)      # first observed revision
)

titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source' = revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)

artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) # which sample is it from
artDF$min.birthday <- as.POSIXct(strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S"))
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) # birthday or beginning of records, whichever comes later
## how many seconds old is each article inside the logged scope?
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))
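
## Quick illustration of the clamping above, with a hypothetical early birthdate:
## anything first edited before startOfRecords counts as born at startOfRecords.
toyBirthday <- as.POSIXct('2005-01-01 00:00:00')
print(pmax(toyBirthday, startOfRecords) == startOfRecords) # TRUE: pre-log births are clamped
rm(toyBirthday)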

## artDF.prot (per-article protection-spell durations) is assumed to be built
## elsewhere in the pipeline, aligned row-for-row with artDF
artDF.prot$pct.prot <- artDF.prot$duration / artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
artDF$pct.prot <- artDF.prot$pct.prot

print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)