From 97e83d0f09ac7b5c6aef02b5d8138c1cfc88748b Mon Sep 17 00:00:00 2001
From: Kaylea Champion
Date: Tue, 22 Nov 2022 12:00:22 -0800
Subject: [PATCH] adds this under git control for a demo today

---
 R_examples/prepDF.R | 148 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 R_examples/prepDF.R

diff --git a/R_examples/prepDF.R b/R_examples/prepDF.R
new file mode 100644
index 0000000..edd862f
--- /dev/null
+++ b/R_examples/prepDF.R
@@ -0,0 +1,148 @@

rm(list=ls())

#####################
# The purpose of this file is to load the datasets and clean them for processing.
#
#####################

#### set globals and make helpers
#basePath = '/home/kaylea/Research/taboo/'
basePath = '/gscratch/comdata/users/kaylea/taboo/'
dataPath = paste0(basePath, 'data/')
rawPath = paste0(basePath, 'raw_data/')
coefPath = dataPath # ASSUMPTION: coefPath is not set anywhere else in this script; pointing it at dataPath -- adjust if the revision data lives in a different directory
botsFile <- paste0(rawPath, 'botList.tsv')
endOfRecords = '2022-06-02 20:15:46' # derived from the end of the action logs
endOfRecords = strptime(endOfRecords, "%Y-%m-%d %H:%M:%S")
startOfRecords = '2008-09-20 05:23:14'
startOfRecords = strptime(startOfRecords, "%Y-%m-%d %H:%M:%S")


library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)


# Read every file in a directory and stack the results into a single data frame.
# Recipe adapted from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)
  for (my_file in file_list) {
    if (my_file == '_SUCCESS') { # Spark metadata file, ignore
      next
    }
    if (!exists("dataset")) {
      # first file: create the merged dataset
      print(paste0('Now Reading: ', path, my_file))
      dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE)
    } else {
      # subsequent files: append to the merged dataset
      temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE)
      dataset <- rbind(dataset, temp_dataset)
      rm(temp_dataset)
    }
  }
  dataset <- unique(dataset)
  return(dataset)
}

# Part 1 - load and clean revisions data


revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF$source <- "taboo"
#revDF.CTab$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)

## drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL

colnames(revDF)

revDF <- revDF[!is.na(revDF$revid),] # drop any rows where revid is NA

# NOTE: userDF (editor-level data) must be loaded before this merge; it is not created in this script.
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) # left (outer) join: all of revDF, plus any matches in userDF

## eliminate any articles in both:

### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) # strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) # make sure both join keys are character
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)] # set isBot to FALSE for every row, then flip it to TRUE wherever a row joins to botDF on editor_id; '.()' is data.table shorthand for list()
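## Illustrative only (not executed): a minimal sketch of the data.table update-join used just above.
## The second [] joins on editor_id and updates isBot only for the rows that match the lookup table.
## The names below (toyRevs, toyBots) are invented for this example.
if (FALSE) {
  library(data.table)
  toyRevs <- data.table(revid = 1:4, editor_id = c("10", "11", "12", "10"))
  toyBots <- data.table(editor_id = "10")
  toyRevs[, isBot := FALSE][toyBots, isBot := TRUE, on = .(editor_id)]
  toyRevs$isBot # TRUE FALSE FALSE TRUE -- only revisions by editor_id "10" are flagged
}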
isABot.tab <- table(revDF$isBot)

## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot==FALSE)
revDF <- NULL # so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon))

##### filtering done, now to do some summing-up

### Weighting
## for each article, the weight of each revision of that article is (N_total_revs/N_total_articles)/N_revs_for_that_article
## this meets two criteria:
## sum(weights) = total number of revisions
## sum(weights for a given article) = sum(weights for any other article)
## (see the illustrative sanity check at the end of this file)

numEdits <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits=length(revid)) ## articlewise revision count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors=length(unique(editor))) ## articlewise editor count, including IP addresses
n.revs <- length(revDF.clean$revid) ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits


revDF.clean$ngramWeight <- revDF.clean$count # rename: 'count' wasn't very descriptive
revDF.clean <- revDF.clean %>% mutate(got_reverted =
  case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE))

table(revDF.clean$anon)
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon=='true'), subset(revDF.clean, revDF.clean$anon=='false')) ## drops a small number of NAs (187), which look like parse problems
table(revDF.clean$anon)


##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]


artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
  across(revid, length),     # number of revisions per article
  across(got_reverted, sum), # number of reverted revisions per article
  across(date_time, min)     # timestamp of the earliest observed revision
)

titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)

artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) # which sample is it from
artDF$min.birthday <- strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S")
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) # birthday or beginning of records, whichever comes later
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs")) # endOfRecords and birthOrLog are already parsed date-times
## how many seconds old is each article inside the logged scope?

## NOTE: artDF.prot (per-article protection spells, with a 'duration' column and secondsOldLog merged in)
## is assumed to be built elsewhere; it is not created in this script.
artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
artDF$pct.prot <- artDF.prot$pct.prot # NOTE: this assumes artDF.prot rows line up with artDF; merging by encodedTitle would be safer

print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)
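## Illustrative only (not executed): a minimal check of the weighting scheme used in Part 1,
## with the same formula as above: weight = (n.revs/n.arts)/numEdits. It shows the two stated
## properties: the weights sum to the total revision count, and every article gets the same
## total weight. The names below (toy, n.revs.toy, n.arts.toy) are invented for this example.
if (FALSE) {
  library(dplyr)
  toy <- data.frame(encodedTitle = c("A", "A", "A", "B")) # article A has 3 revisions, B has 1
  toy <- toy %>% group_by(encodedTitle) %>% mutate(numEdits = n()) %>% ungroup()
  n.revs.toy <- nrow(toy)                    # 4 revisions in total
  n.arts.toy <- n_distinct(toy$encodedTitle) # 2 articles
  toy$weight <- (n.revs.toy/n.arts.toy)/toy$numEdits
  sum(toy$weight)                            # 4: equals the total number of revisions
  tapply(toy$weight, toy$encodedTitle, sum)  # A = 2, B = 2: equal total weight per article
}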