rm(list=ls())

#####################
# The purpose of this file is to load up the datasets and clean them for processing.
#####################

#### set globals and make helpers
#basePath <- '/home/kaylea/Research/taboo/'
basePath <- '/gscratch/comdata/users/kaylea/taboo/'
dataPath <- paste0(basePath, 'data/')
rawPath  <- paste0(basePath, 'raw_data/')
botsFile <- paste0(rawPath, 'botList.tsv')

## observation window; wrapped in as.POSIXct() so the timestamps behave as atomic
## data.frame columns and work with pmax() below (strptime() alone returns POSIXlt)
endOfRecords   <- as.POSIXct(strptime('2022-06-02 20:15:46', "%Y-%m-%d %H:%M:%S")) # derived from end of the action logs
startOfRecords <- as.POSIXct(strptime('2008-09-20 05:23:14', "%Y-%m-%d %H:%M:%S"))

library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)

# recipe adapted from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
    file_list <- list.files(path)
    print(file_list)
    dataset <- NULL
    for (my_file in file_list) {
        if (my_file == '_SUCCESS') { # Spark metadata file, ignore
            next
        }
        print(paste0('Now Reading: ', path, my_file))
        temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE,
                                   sep="\t", stringsAsFactors=FALSE)
        if (is.null(dataset)) {
            dataset <- temp_dataset # the first file starts the merged dataset
        } else {
            dataset <- rbind(dataset, temp_dataset) # later files are appended to it
        }
        rm(temp_dataset)
    }
    dataset <- unique(dataset) # drop any duplicated rows
    return(dataset)
}
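
## For reference, a more compact reader sketched with data.table's rbindlist();
## it assumes the same layout (tab-separated files plus a Spark '_SUCCESS' marker)
## and returns a data.table rather than a data.frame. Not called below; the
## pipeline uses readPileToDF() above. The name readPileToDF.fast is hypothetical.
readPileToDF.fast <- function(path) {
    files <- setdiff(list.files(path), '_SUCCESS')
    piles <- lapply(files, function(f) read.table(paste0(path, f), quote="\"",
                    header=TRUE, sep="\t", stringsAsFactors=FALSE))
    unique(rbindlist(piles)) # rbindlist() stacks the per-file frames in one pass
}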

# Part 1 - load and clean revisions data

revDF <- readPileToDF(paste0(rawPath, 'revDataPlusUPL/')) # path prefix assumed to be rawPath
revDF$source <- "taboo"
#revDF$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)

## drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL

colnames(revDF)

revDF <- revDF[!is.na(revDF$revid),] # drop any rows where revid is NA

# userDF (per-editor covariates) is assumed to have been loaded earlier in the pipeline
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) # left (outer) join: all of revDF, plus any matches in userDF

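## Toy illustration of the left join above, with made-up values: merge(..., all.x=TRUE)
## keeps every revision even when the editor has no userDF row, filling NA instead.
toyLeft  <- data.frame(editor = c('a', 'b'))
toyRight <- data.frame(editor = 'a', groups = 'sysop') # 'groups' is a made-up column
print(merge(toyLeft, toyRight, by = 'editor', all.x = TRUE)) # 'b' survives, with groups = NA
rm(toyLeft, toyRight) # clean up so the toys don't land in save.image()
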
## eliminate any articles in both:

### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) # strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) # make sure both join keys are character
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
## update join: initialize isBot to FALSE, then flip it to TRUE for every row whose
## editor_id matches a row of botDF ('.()' is data.table shorthand for list())
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)]
isABot.tab <- table(revDF$isBot)
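
## Toy illustration of the update-join idiom above: rows whose key appears in the
## lookup table get flagged TRUE, everything else keeps the FALSE default.
toyDT   <- data.table(editor_id = c('1', '2', '3'))
toyBots <- data.table(editor_id = '2')
toyDT <- toyDT[, isBot := FALSE][toyBots, isBot := TRUE, on = .(editor_id)]
print(toyDT) # only editor_id '2' is flagged
rm(toyDT, toyBots) # clean up so the toys don't land in save.image()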

## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot == FALSE)
revDF <- NULL # so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon)) # anon is 'true'/'false'; logged-in editors are the non-anonymous ones

##### filtering done, now to do some summing-up

### Weighting
## For each article, the weight of each of its revisions is (N_total_revs / N_total_articles) / N_revs_for_that_article.
## This meets two criteria:
##   sum(weights over all revisions) = total number of revisions
##   sum(weights for any given article) = sum(weights for any other article)

numEdits   <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits = length(revid))            ## article-wise revision count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors = length(unique(editor))) ## article-wise editor count, including IP addresses
n.revs <- length(revDF.clean$revid)     ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs / n.arts) / revDF.clean$numEdits
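
## Cheap sanity check of the two weighting criteria (assuming no rows were lost in
## the merges above): total weight equals the revision count, and every article
## carries the same total weight, n.revs / n.arts.
stopifnot(isTRUE(all.equal(sum(revDF.clean$weight), n.revs)))
perArticleWeight <- tapply(revDF.clean$weight, revDF.clean$encodedTitle, sum)
stopifnot(isTRUE(all.equal(max(perArticleWeight), min(perArticleWeight))))
rm(perArticleWeight)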

revDF.clean$ngramWeight <- revDF.clean$count # rename: 'count' wasn't very descriptive
revDF.clean <- revDF.clean %>% mutate(got_reverted =
    case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE)) # i.e., got_reverted is !is.na(reverted_by)

table(revDF.clean$anon)
## keep only rows where anon parsed cleanly; the small number of NAs (187) look like parse problems
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon == 'true'),
                     subset(revDF.clean, revDF.clean$anon == 'false'))
table(revDF.clean$anon)

##### dropping items with missing revids; if this happens, find out why
### revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]

artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
    revid        = length(revid),      # per-article revision count
    got_reverted = sum(got_reverted),  # per-article count of reverted revisions
    date_time    = min(date_time)      # first observed revision
)

titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source' = revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)

artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) # which sample is it from
artDF$min.birthday <- as.POSIXct(strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S"))
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) # birthday or beginning of records, whichever comes later
## how many seconds old is each article inside the logged scope?
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))
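
## Quick illustration of the clamping above, with a hypothetical early birthdate:
## anything first edited before startOfRecords counts as born at startOfRecords.
toyBirthday <- as.POSIXct('2005-01-01 00:00:00')
print(pmax(toyBirthday, startOfRecords) == startOfRecords) # TRUE: pre-log births are clamped
rm(toyBirthday)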

## artDF.prot (per-article protection-spell durations) is assumed to be built
## elsewhere in the pipeline, aligned row-for-row with artDF
artDF.prot$pct.prot <- artDF.prot$duration / artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
artDF$pct.prot <- artDF.prot$pct.prot

print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)