adds this under git control for a demo today

2022-11-22 12:00:22 -08:00 · 2022-11-22 12:00:22 -08:00 · 97e83d0f09
commit 97e83d0f09
parent a03c08a932
1 changed files with 148 additions and 0 deletions
--- a/R_examples/prepDF.R
+++ b/R_examples/prepDF.R
@ -0,0 +1,148 @@
+
+rm(list = ls()) # start from an empty workspace; the script finishes by saving the whole image
+
+#####################
+# Purpose of this file: load up the datasets and clean them for processing.
+#####################
+
+#### Globals and helpers
+# Alternate base path, currently disabled:
+#basePath <- '/home/kaylea/Research/taboo/'
+basePath <- '/gscratch/comdata/users/kaylea/taboo/'
+rawPath <- paste0(basePath, 'raw_data/')
+dataPath <- paste0(basePath, 'data/')
+botsFile <- paste0(rawPath, 'botList.tsv')
+
+# Bounds of the observation window, parsed with strptime() (POSIXlt values).
+startOfRecords <- strptime('2008-09-20 05:23:14', "%Y-%m-%d %H:%M:%S")
+endOfRecords <- strptime('2022-06-02 20:15:46', "%Y-%m-%d %H:%M:%S") #derived from end of the action logs
+
+
+
+library(dplyr)
+library(sqldf)
+library(lubridate)
+library(data.table)
+library(urltools)
+
+
+# Read every tab-separated file in a directory into one de-duplicated data frame.
+# Loosely based on the recipe at
+# https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
+#
+# Args:
+#   path: directory holding the files. It is pasted directly in front of each
+#         file name, so it must end with a path separator.
+# Returns:
+#   A data frame with the rows of all files stacked by rbind() and exact
+#   duplicate rows removed by unique().
+# Stops with an error if the directory holds no files other than '_SUCCESS'.
+readPileToDF <- function(path) {
+  file_list <- list.files(path)
+  print(file_list)
+  file_list <- file_list[file_list != '_SUCCESS'] #spark metadata file, ignore
+  if (length(file_list) == 0) {
+    stop('No data files found in: ', path, call. = FALSE)
+  }
+  # Read each file exactly once into a local list. The earlier exists("dataset")
+  # test appended the first file to itself and could also pick up a `dataset`
+  # object from an enclosing environment.
+  pieces <- lapply(file_list, function(my_file) {
+    print(paste0('Now Reading: ', path, my_file))
+    read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE)
+  })
+  # Bind once at the end instead of growing the data frame inside a loop.
+  dataset <- unique(do.call(rbind, pieces))
+  return(dataset)
+}
+
+# Part 1 - load and clean revisions data
+
+# NOTE(review): `coefPath` is not defined anywhere in this file, and the
+# rm(list=ls()) at the top removes anything inherited from the session, so this
+# call fails as written. Presumably dataPath or rawPath was intended -- confirm.
+revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
+revDF$source <- "taboo" 
+#revDF$taboo <- 1
+revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0 #treat a missing userpage length as 0 characters
+# The statements below used to reference `revDF.CTab`, which is never created in
+# this file; they now act on revDF, the only revisions table loaded here.
+head(revDF)
+
+##drop unneeded fields
+revDF$prediction <- NULL
+revDF$filtered_title <- NULL
+revDF$target <- NULL
+
+colnames(revDF)
+
+revDF <- revDF[!is.na(revDF$revid),] #drop any where revid is NA
+
+# NOTE(review): `userDF` is not loaded anywhere in this file, so this merge fails
+# as written -- confirm where the per-editor table is meant to come from.
+revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) #left (outer) join: all of revDF, plus any matches in userDF
+
+## eliminate any articles in both:
+# NOTE(review): no code follows the heading above in this file -- confirm whether a step is missing.
+
+### Build the bot lookup table
+botDF <- unique(read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)) #unique() strips out any repetitions
+#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
+head(revDF)
+head(botDF)
+# Hold the ids as character on both sides so the join below compares like with like.
+botDF$editor_id <- as.character(botDF$BotUserID)
+revDF$editor_id <- as.character(revDF$editor_id)
+head(revDF)
+head(botDF)
+
+# Convert both tables to data.table by reference.
+setDT(revDF)
+setDT(botDF)
+
+# Flag bot edits in two steps: default every row to FALSE, then an update-join
+# sets isBot to TRUE on each revDF row whose editor_id also appears in botDF.
+revDF[, isBot := FALSE]
+revDF[botDF, isBot := TRUE, on = "editor_id"]
+isABot.tab <- table(revDF$isBot)
+
+## Keep only the non-bot revisions from here on
+revDF.clean <- subset(revDF, !isBot)
+revDF <- NULL #so we don't use it accidentally
+revDF.clean$loggedIn <- !as.logical(revDF.clean$anon)
+
+##### filtering done, now to do some summing-up
+
+### Weighting
+## Each revision of an article gets the weight
+##   (N_rev_total / N_total_articles) / N_art_revnum
+# which satisfies two criteria:
+#   sum(weights) = total_revs
+#   sum(weights for a given article) is the same for every article
+
+##articlewise revisions count
+numEdits <- revDF.clean %>%
+  group_by(encodedTitle) %>%
+  dplyr::summarize(numEdits = n())
+###articlewise editors count, including IP addresses
+numEditors <- revDF.clean %>%
+  group_by(encodedTitle) %>%
+  dplyr::summarize(numEditors = n_distinct(editor))
+n.revs <- nrow(revDF.clean) ## total number of revisions
+n.arts <- nrow(numEdits) ## total number of articles
+
+# Attach both per-article counts to every revision row, then derive the weight.
+revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
+revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
+revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits
+
+
+revDF.clean$ngramWeight <- revDF.clean$count #wasn't very descriptive 
+# A revision counts as reverted exactly when reverted_by is not NA.
+revDF.clean <- revDF.clean %>% mutate(got_reverted = !is.na(reverted_by))
+
+table(revDF.clean$anon)
+# Keep only rows whose anon flag is literally 'true' or 'false'.
+##small number of NAs (187), look like parse problems 
+# NOTE(review): the weights above were computed before these rows are dropped -- confirm that is intended.
+revDF.clean <- rbind(
+  subset(revDF.clean, anon == 'true'),
+  subset(revDF.clean, anon == 'false')
+)
+table(revDF.clean$anon)
+
+
+##### dropping items with missing revids; if this happens, find out why
+###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]
+
+
+
+
+# One row per article: number of revisions, number of reverted revisions and the
+# earliest date_time. across() keeps the original column names.
+artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
+        across(revid, length),
+        across(got_reverted, sum),
+        across(date_time, min)
+)
+
+# Distinct (title, source) pairs, used to label each article with its sample.
+titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source)
+titleSampleDF <- unique(titleSampleDF)
+
+artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) #which sample is it from
+artDF$min.birthday <- strptime(artDF$date_time,  "%Y-%m-%d %H:%M:%S")
+artDF$startOfRecords <- startOfRecords
+artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) #birthday or beginning of records, whichever comes later
+## how many seconds old is each article inside the logged scope? 
+# endOfRecords and birthOrLog are already date-time objects, so they go straight
+# to difftime(). Re-parsing them with strptime() converts through as.character(),
+# which in R >= 4.3 drops a midnight time part and would then parse to NA.
+artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))
+
+# NOTE(review): `artDF.prot` is not created anywhere in this file, so the next two
+# statements fail as written -- confirm where the per-article protection
+# durations (`duration`) are meant to be loaded and joined.
+artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
+
+artDF$pct.prot <- artDF.prot$pct.prot
+
+print("saving full image")
+save.image(paste0(dataPath, "dataset1.RData"), version=2)