rm(list=ls())

#####################
# The purpose of this file is to load up the datasets and clean them for processing.
#####################

#### set globals and make helpers
#basePath = '/home/kaylea/Research/taboo/'
basePath = '/gscratch/comdata/users/kaylea/taboo/'
dataPath = paste0(basePath, 'data/')
rawPath = paste0(basePath, 'raw_data/')
# NOTE: coefPath must point at the directory holding the Spark output piles;
# assuming here that it lives under dataPath -- adjust if it lives elsewhere.
coefPath = dataPath
botsFile <- paste0(rawPath, 'botList.tsv')

endOfRecords = '2022-06-02 20:15:46' # derived from end of the action logs
endOfRecords = as.POSIXct(strptime(endOfRecords, "%Y-%m-%d %H:%M:%S"))
startOfRecords = '2008-09-20 05:23:14'
startOfRecords = as.POSIXct(strptime(startOfRecords, "%Y-%m-%d %H:%M:%S")) # POSIXct (not POSIXlt) so pmax()/difftime() below behave

library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)

# recipe from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)
  dataset <- NULL
  for (my_file in file_list) {
    if (my_file == '_SUCCESS') { # spark metadata file, ignore
      next
    }
    print(paste0('Now Reading: ', path, my_file))
    temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE,
                               sep="\t", stringsAsFactors=FALSE)
    # if the merged dataset doesn't exist yet, create it; otherwise append to it
    if (is.null(dataset)) {
      dataset <- temp_dataset
    } else {
      dataset <- rbind(dataset, temp_dataset)
    }
    rm(temp_dataset)
  }
  dataset <- unique(dataset)
  return(dataset)
}

# Part 1 - load and clean revisions data
revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF$source <- "taboo"
#revDF$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)

## drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL
colnames(revDF)

revDF <- revDF[!is.na(revDF$revid),] # drop any rows where revid is NA
# NOTE: userDF (a per-editor table keyed by 'editor') is not loaded in this
# script; it must be built upstream before this merge will run.
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) # left (outer) join: all of revDF, plus any matches in userDF

## eliminate any articles in both:
### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) # strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) # just to make sure both join keys are character
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
# set isBot to FALSE everywhere, then flip it to TRUE wherever a row's editor_id
# joins to an entry in botDF ('.()' is data.table shorthand for list())
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)]
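
# A minimal toy demonstration of the flag-by-update-join idiom used above
# (the toy.edits/toy.bots tables are hypothetical, purely illustrative, and
# removed again immediately):
toy.edits <- data.table(editor_id = c('1', '2', '3', '2'))
toy.bots <- data.table(editor_id = '2')
toy.edits[, isBot := FALSE][toy.bots, isBot := TRUE, on = .(editor_id)]
print(toy.edits$isBot) # FALSE TRUE FALSE TRUE
rm(toy.edits, toy.bots)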
isABot.tab <- table(revDF$isBot)

## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot==FALSE)
revDF <- NULL # so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon))

##### filtering done, now to do some summing-up

### Weighting
## for each article, the weight of each of its revisions is
## (N_revs_total / N_articles_total) / N_revs_for_that_article
# this weighting meets two criteria:
#   sum(weights) = total_revs
#   sum(weights for a given article) = sum(weights for any other article)
numEdits <- revDF.clean %>% group_by(encodedTitle) %>%
  dplyr::summarize(numEdits = length(revid)) ## articlewise revision count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>%
  dplyr::summarize(numEditors = length(unique(editor))) ## articlewise editor count, including IP addresses
n.revs <- length(revDF.clean$revid) ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits
revDF.clean$ngramWeight <- revDF.clean$count # the name 'count' wasn't very descriptive
revDF.clean <- revDF.clean %>% mutate(got_reverted = !is.na(reverted_by))

table(revDF.clean$anon)
# keep only rows where anon parsed as 'true' or 'false'; the small number of NAs (187) look like parse problems
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon=='true'),
                     subset(revDF.clean, revDF.clean$anon=='false'))
table(revDF.clean$anon)

##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]

artDF <- revDF.clean %>%
  dplyr::group_by(encodedTitle) %>%
  dplyr::summarize(
    revid = length(revid),
    got_reverted = sum(got_reverted),
    date_time = min(date_time)
  )
titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source' = revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)
artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) # record which sample each article is from
artDF$min.birthday <- as.POSIXct(strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S"))
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) # birthday or beginning of records, whichever comes later
## how many seconds old is each article inside the logged scope?
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))

## what proportion of its observed life was each article protected?
# NOTE: artDF.prot (per-article protection spans, carrying a 'duration' in
# seconds plus secondsOldLog, row-aligned with artDF) is not constructed in
# this script; it must be built from the protection logs upstream.
artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog
artDF$pct.prot <- artDF.prot$pct.prot

print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)
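
# Optional sanity checks for the weighting scheme above -- a sketch, assuming
# revDF.clean, n.revs, and n.arts are still in the workspace. The two weighting
# invariants hold exactly when the weights are first computed; the later drop
# of the ~187 anon-NA rows perturbs them slightly, hence the loose tolerance.
stopifnot(isTRUE(all.equal(sum(revDF.clean$weight), n.revs, tolerance = 1e-3)))
# each article's weights should total roughly n.revs/n.arts:
perArticleWeight <- tapply(revDF.clean$weight, revDF.clean$encodedTitle, sum)
summary(perArticleWeight / (n.revs/n.arts)) # ratios should cluster tightly around 1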