adds this under git control for a demo today
This commit is contained in:
parent
a03c08a932
commit
97e83d0f09
148
R_examples/prepDF.R
Normal file
148
R_examples/prepDF.R
Normal file
@ -0,0 +1,148 @@
|
||||
|
||||
# Start from an empty workspace: the save.image() call at the end of this
# script writes out every object left in the global environment.
rm(list = ls())
|
||||
|
||||
#####################
|
||||
#The purpose of this file is to load up the datasets and clean them for processing.
|
||||
#
|
||||
#####################
|
||||
|
||||
#### Globals and helpers ----
# Alternate base path (disabled):
# basePath <- '/home/kaylea/Research/taboo/'
basePath <- '/gscratch/comdata/users/kaylea/taboo/'
dataPath <- paste0(basePath, 'data/')
rawPath <- paste0(basePath, 'raw_data/')
botsFile <- paste0(rawPath, 'botList.tsv')

# Observation window, parsed into date-time objects.
# The end bound is derived from the end of the action logs.
endOfRecords <- strptime('2022-06-02 20:15:46', "%Y-%m-%d %H:%M:%S")
startOfRecords <- strptime('2008-09-20 05:23:14', "%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
|
||||
library(dplyr)
|
||||
library(sqldf)
|
||||
library(lubridate)
|
||||
library(data.table)
|
||||
library(urltools)
|
||||
|
||||
|
||||
# Read every tab-separated file in a directory into one de-duplicated data frame.
#
# Adapted from the recipe at
# https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
#
# Args:
#   path: directory holding the files. It is glued to each file name with
#         paste0(), so it must end with a path separator.
# Returns:
#   A data.frame holding the unique rows of all files combined.
# Raises:
#   An error if the directory holds no data files.
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)

  # '_SUCCESS' is a spark metadata file, not data
  data_files <- file_list[file_list != '_SUCCESS']
  if (length(data_files) == 0) {
    stop("no data files found in ", path, call. = FALSE)
  }

  # Read each file exactly once into a local list. The earlier exists("dataset")
  # test re-read the first file and could also pick up a `dataset` object from
  # an enclosing environment.
  pieces <- lapply(data_files, function(my_file) {
    print(paste0('Now Reading: ', path, my_file))
    read.table(paste0(path, my_file), quote = "\"", header = TRUE, sep = "\t",
               stringsAsFactors = FALSE)
  })

  # Bind once instead of growing the frame inside the loop, then drop repeats.
  unique(do.call(rbind, pieces))
}
|
||||
|
||||
# Part 1 - load and clean revisions data

# NOTE(review): `coefPath` is never defined in this file (only basePath,
# dataPath and rawPath are), and the workspace is wiped at the top of the
# script. Fail with an explicit message until the intended directory is confirmed.
if (!exists("coefPath")) {
  stop("coefPath is not defined; set it to the directory that contains 'revDataPlusUPL/'",
       call. = FALSE)
}
revDF <- readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF$source <- "taboo"
#revDF.CTab$taboo <- 1
# treat a missing userpage_text_chars value as 0
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
# The statements below used to reference `revDF.CTab`, an object that is never
# created in this file; they act on revDF, the only revisions table loaded here.
head(revDF)

##drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL

colnames(revDF)

revDF <- revDF[!is.na(revDF$revid), ] #drop any where revid is NA

# NOTE(review): `userDF` is not loaded anywhere in this file; it has to be
# created (with an `editor` column) before this join can run.
if (!exists("userDF")) {
  stop("userDF is not defined; load the per-editor table before merging", call. = FALSE)
}
#left (outer) join: all of revDF, plus any matches in userDF
revDF <- merge(x = revDF, y = userDF, by = 'editor', all.x = TRUE)
|
||||
|
||||
## eliminate any articles in both:
# NOTE(review): no code follows this heading in the file.

### prepare bot filter
# Load the bot list and strip out any repetitions.
botDF <- unique(
  read.table(botsFile, header = TRUE, sep = '\t', quote = '"', stringsAsFactors = FALSE)
)
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)

# Join key: editor ids as character on both sides, just to make sure they compare.
botDF$editor_id <- as.character(botDF$BotUserID)
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)

# Switch both tables to data.table for the update join below.
revDF <- setDT(revDF)
botDF <- setDT(botDF)

# Mark every revision as not-a-bot first, then flip the flag to TRUE on the
# rows whose editor_id has a match in botDF.
revDF <- revDF[, isBot := FALSE]
revDF <- revDF[botDF, isBot := TRUE, on = "editor_id"]
isABot.tab <- table(revDF$isBot)

## drop all bots here
revDF.clean <- subset(revDF, !isBot)
revDF <- NULL #so we don't use it accidentally
revDF.clean$loggedIn <- !as.logical(revDF.clean$anon)
|
||||
|
||||
##### filtering done, now to do some summing-up

### Weighting
## Every revision of an article is weighted by
##   (N_rev_total / N_total_articles) / N_art_revnum
# which meets two criteria:
#   sum(weights) = total_revs
#   each article's weights add up to the same total (n.revs / n.arts)

## articlewise revisions count
numEdits <- revDF.clean %>%
  group_by(encodedTitle) %>%
  dplyr::summarize(numEdits = dplyr::n())

### articlewise editors count, including IP addresses
numEditors <- revDF.clean %>%
  group_by(encodedTitle) %>%
  dplyr::summarize(numEditors = dplyr::n_distinct(editor))

n.revs <- nrow(revDF.clean) ## total number of revisions
n.arts <- nrow(numEdits)    ## total number of articles

# Attach both per-article counts to every revision row.
revDF.clean <- merge(
  merge(revDF.clean, numEdits, by = "encodedTitle"),
  numEditors,
  by = "encodedTitle"
)
revDF.clean$weight <- (n.revs / n.arts) / revDF.clean$numEdits

revDF.clean$ngramWeight <- revDF.clean$count #wasn't very descriptive

# A revision counts as reverted whenever reverted_by is filled in.
revDF.clean <- revDF.clean %>%
  mutate(got_reverted = !is.na(reverted_by))

table(revDF.clean$anon)
## keep only rows whose anon flag is 'true' or 'false';
## small number of NAs (187), look like parse problems
revDF.clean <- rbind(
  subset(revDF.clean, anon == 'true'),
  subset(revDF.clean, anon == 'false')
)
table(revDF.clean$anon)
|
||||
|
||||
|
||||
##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]

## Article-level table: one row per encodedTitle.
artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
  across(revid, length),      # number of revisions
  across(got_reverted, sum),  # number of revisions flagged got_reverted
  across(date_time, min)      # smallest date_time value
)

titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source' = revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)

artDF <- merge(artDF, titleSampleDF, by = 'encodedTitle', all.x = TRUE) #which sample is it from
artDF$min.birthday <- strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S")
artDF$startOfRecords <- startOfRecords
#birthday or beginning of records, whichever comes later
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords)

## how many seconds old is each article inside the logged scope?
# endOfRecords and birthOrLog are already date-time objects, so they go to
# difftime() directly; re-parsing their character form is redundant and can
# yield NA when that form leaves out the time part.
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units = "secs"))

# NOTE(review): `artDF.prot` is never created in this file, although its
# `duration` and `secondsOldLog` columns are read below. Fail with an explicit
# message until the step that builds it is restored.
if (!exists("artDF.prot")) {
  stop("artDF.prot is not defined; build the per-article protection table before computing pct.prot",
       call. = FALSE)
}
## what proportion of its observed life was the article protected?
artDF.prot$pct.prot <- artDF.prot$duration / artDF.prot$secondsOldLog

# NOTE(review): positional copy - assumes artDF.prot has the same rows in the
# same order as artDF; confirm, or merge on encodedTitle instead.
artDF$pct.prot <- artDF.prot$pct.prot

print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version = 2)
|
Loading…
Reference in New Issue
Block a user