diff --git a/R_examples/EDA.Rmd b/R_examples/EDA.Rmd index 0fae152..8d16901 100644 --- a/R_examples/EDA.Rmd +++ b/R_examples/EDA.Rmd @@ -1,402 +1 @@ ---- -title: "Taboo EDA" -author: "Kaylea Champion" -date: "2/3/2022" -output: pdf_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library('ggplot2') -library('scales') -setwd("~/Research/cdsc_examples_repository/R_examples") -load('~/Research/cdsc_examples_repository/R_examples/data/EDA.RData') - -reverselog_trans <- function(base = exp(1)) { - trans <- function(x) -log(x, base) - inv <- function(x) base^(-x) - trans_new(paste0("reverselog-", format(base)), trans, inv, - log_breaks(base = base), - domain = c(1e-100, Inf)) -} - -options(scipen=999) -``` - -## Overview - -The purpose of this study is to understand the production and development of taboo topics. - -### Independent Variables -- taboo (article) - -### Dependent Variables -- article readership (h1) -- article quality (h2) -- contribution quality (h3) -- contributor identifiability (h4) - -### Important Controls - -- article age (quality tends to grow over time as people have the opportunity to contribute) -- time (general traffic trends wax and wane) - -### Relationships to be Tested - -- H1 - taboo topics have higher readership than random articles -- H2 - taboo topics are lower quality than random articles -- H3 - taboo topics receive lower quality contributions than random articles -- H4 - taboo topics are more likely to receive contributions from less identifiable contributors than random articles - -### Unit(s) of Analysis - - The article (h1, h2) - - The revision of the article (h3, h4) - -## H1 Descriptives: Viewership - -```{r, H1descriptives, echo=FALSE,results='hide',fig.keep='all'} -##article readership -- vDF - -hist(vDF$viewSum) -hist(log(vDF$viewSum)) -``` - -We don't use sum, however -- we use view rank. View Rank behaves reasonably well if logged: - -```{r} - - -hist(vDF$viewRankInMyMonth) -hist(log(vDF$viewRankInMyMonth)) - -## still a model w/time - loess -g <- ggplot(vDF, aes(x=date, y=viewRankInMyMonth, color=source)) + - geom_smooth() + - #scale_y_continuous(trans='reverse') + - theme_bw() + - scale_y_continuous(trans=reverselog_trans(10), - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(10^.x))) + - scale_x_date() + - scale_color_discrete(name="Source", labels=c("Random", "Taboo")) + - theme(legend.position = 'bottom', legend.title = element_blank()) + - labs(x="Time", y="View Rank") - -g - -``` - -```{r H2descriptivesp1} - -## absolute quality -g <- ggplot(qDF, aes(x=date, y=weighted_sum, group=encodedTitle, color=source)) + - #geom_smooth() + - #scale_y_continuous(trans='reverse') + - ## change this to show trajectories - #geom_line(alpha=.05) + #angry R!! - #geom_point(alpha=.05) + - theme_bw() + - geom_rug(alpha=.1) + -# scale_y_continuous(trans=reverselog_trans(10), -# breaks = trans_breaks("log10", function(x) 10^x), -# labels = trans_format("log10", math_format(10^.x))) + - scale_x_date() + - scale_color_discrete(name="Source", labels=c("Random", "Taboo")) + - theme(legend.position = 'bottom', legend.title = element_blank()) + - labs(x="Time", y="Quality") - -g - -## facet grid -g <- ggplot(qDF, aes(x=source, y=weighted_sum)) + - geom_boxplot() - -g - -table(qDF$source) - -g <- ggplot(qDF, aes(x=weighted_sum)) + - geom_histogram() + - facet_grid(source~., scales='free_y') - -g - -``` - -This graph shows that taboo articles originated much earlier than those obtained with a random sample approach. - -Another key variable is quality --- which again is ranked and logged. - -```{r H2descriptivesp2} -## relative quality - -hist(qDF$weighted_sum) -``` - -```{r} -hist(log(qDF$weighted_sum)) -``` - -```{r} -hist(qDF$qualityRankInMyMonth) -``` - -```{r} -hist(log(qDF$qualityRankInMyMonth)) -``` - -```{r} - -qt <- subset(qDF, qDF$source=='taboo') -qr <- subset(qDF, qDF$source=='ngram') - -mean(qDF$weighted_sum) -mean(qr$weighted_sum) -mean(qt$weighted_sum) -mean(qr$qualityRankInMyMonth) -mean(qt$qualityRankInMyMonth) - -g <- ggplot(qDF, aes(x=qualityRankInMyMonth, color=source)) + - geom_boxplot() - -g - -g <- ggplot(qDF, aes(x=weighted_sum, color=source)) + - geom_boxplot() - -g - -g <- ggplot(qDF, aes(x=date, y=qualityRankInMyMonth, color=source)) + - geom_smooth() + - #scale_y_continuous(trans='reverse') + - theme_bw() + - geom_rug(alpha=.1) + - #scale_y_continuous(trans=reverselog_trans(10), - # breaks = trans_breaks("log10", function(x) 10^x), - # labels = trans_format("log10", math_format(10^.x))) + - scale_x_date() + - #scale_color_discrete(name="Source", labels=c("Random", "Taboo")) + - theme(legend.position = 'bottom', legend.title = element_blank()) + - labs(x="Time", y="Quality Rank") - -g - -summary(qt$qualityRankInMyMonth) -summary(qr$qualityRankInMyMonth) -summary(vDF$viewRankInMyMonth) -summary(qDF$qualityRankInMyMonth) - - -``` - -This examines the same data but with a relative ranking approach for quality. Each article's relative rank is in terms of quality that month, compared to all other articles that were present in Wikipedia at that time. Lower numbers are better (at the top of the graph). - -## H3 - Contribution Quality - -```{r H3descriptives} -##revision quality -artDF$pct_revert <- (artDF$got_reverted / artDF$revid) * 100 -table(artDF$source) -hist(artDF$got_reverted) -hist(log(artDF$got_reverted)) -hist(artDF$revid) -hist(log(artDF$revid)) - -cor.test(artDF$pct_revert, as.numeric(artDF$source)) -cor.test(artDF$revid, artDF$pct_revert) - -#drop Barack Obama -artDF <- subset(artDF, artDF$taboo!='0.5') -table(artDF$taboo) -artDF$taboo <- as.logical(artDF$taboo) - -g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=source)) + - geom_point() + - labs(x='Number of Revisions', y='Percent Reverted') - -g - -ggplot(artDF, aes(x=got_reverted, color=source)) + - geom_boxplot() - -g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=source)) + - geom_point(alpha=.2) + - geom_smooth() + - geom_rug(alpha=.2)+ - scale_x_continuous(trans='log') + - scale_y_continuous(trans='log') + - labs(x='Number of Revisions', y='Number Reverted') - -g - -g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=source)) + - geom_point(alpha=.2) + - geom_smooth() + - geom_rug(alpha=.2)+ - scale_x_continuous() + - scale_y_continuous() + - labs(x='Number of Revisions', y='Percent Reverted') - -g - -## pull edits that were later reverted, since they therefore didn't really contribute per se -revDF.clean.norev <- subset(revDF.clean, revDF.clean$got_reverted==FALSE) -revDF.clean.norev <- subset(revDF.clean.norev, revDF.clean.norev$revert != '') -table(revDF.clean.norev$anon) - -print('proportion of reverts by source') -cleanTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert) -ftable(cleanTable) -prop.table(cleanTable, 1) -prop.table(cleanTable, 2) - -``` - -These results preliminarily suggest a log-log linear model. However, we observe that we do not have support in both samples across the full range of the data. The random sample contains a large number of zeroes. - -## H4: Anonymity - -H4 tests three limits on identifiability: -* contributing without an account -* contributing with a new account (or, being a relative newcomer) -* having a short user page. - -We want to make sure we have a reasonable amount of variation and handle zeroes correctly for these measures. -### Contributing without an account - -Proportion of Edits that are made without an account: -```{r} - -anonTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert, revDF.clean.norev$anon) -anonTable -ftable(anonTable) -prop.table(ftable(anonTable),1) - -xTabAnon <- xtabs(~source+revert+anon,data=revDF.clean.norev) -ftable(xTabAnon) -prop.table(ftable(xTabAnon), 1) - -ggplot(revDF.clean.norev, aes(x=anon, color=source)) + - geom_boxplot() -``` - - -### Contributing with a new account -```{r H4descriptives} - -g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) + - geom_histogram() + - facet_grid(source~., scales='free_y') - -g - -g <- ggplot(revDF.clean.norev, aes(x=log(editor_nth_edit_nocollapse))) + - geom_histogram() + - facet_grid(source~., scales='free_y') - -g - -g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) + - geom_boxplot() + - facet_grid(source~., scales='free_y') - -g - -t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==TRUE) -mean(t$editor_nth_edit_nocollapse) - -t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==FALSE) -mean(t$editor_nth_edit_nocollapse) - -g <- ggplot(revDF.clean, aes(x=log(editor_nth_edit_nocollapse))) + - geom_boxplot() + - facet_grid(source~., scales='free_y') - -g - -``` - -### Having a short user page - -```{r} -summary(revDF.clean$userpage_text_chars) - -g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=source)) + - geom_histogram(binwidth = .5) + - facet_grid(source~., scales='free_y') - -g - -g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=source)) + - geom_histogram(binwidth = .5) + - facet_grid(source~., scales='free_y') - -g - - -# look at last edit, what's the 'final' quality? -library(dplyr) -finalQDF <- qDF %>% group_by(encodedTitle) %>% filter(timestamp == max(timestamp)) - -g <- ggplot(finalQDF, aes(x=weighted_sum, y=source)) + - geom_boxplot() + - labs(x='Final Quality') - -g - -summary(artDF$got_reverted/artDF$revid) -``` - - -### Gender - -I used the API to find out a bit of user information -- are they emailable, whether gender was specified, and if it was specified, whether the person was female. - -```{r gender user data} - -table(revDF.clean$gender) -userDF <- data.frame('editor'=revDF.clean$editor, 'gender' = revDF.clean$gender, 'emailable' = revDF.clean$emailable, 'anon'=revDF.clean$anon) - -userDF <- subset(userDF, userDF$anon == 'false') -userDF <- unique(userDF) -table(userDF$gender) - -prop.table(table(revDF.clean$source, revDF.clean$gender), margin = 1) -gaveGenderDF <- subset(revDF.clean, revDF.clean$gender != 'unknown') -prop.table(table(gaveGenderDF$source, gaveGenderDF$gender), margin = 1) - -``` - - -### Being Emailable - - -```{r emailable user data} - -table(revDF.clean$emailable) - -prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1) - -``` - -### Protections - -```{r protection} - - -g <- ggplot(artDF, aes(x=pct.prot, group=source)) + - geom_boxplot() + - labs(x='Protection Proportion') - -g - -artDF.prot <- subset(artDF, artDF$pct.prot > 0) - -g <- ggplot(artDF.prot, aes(x=pct.prot, group=source)) + - geom_boxplot() + - labs(x='Protection Proportion (non-zero only)') - -g - - -``` \ No newline at end of file +/annex/objects/SHA256E-s10456--a2f961c3e0484f5f253aefec4e79c1ede34a64c6a6e56c6f90647dc98809eabc.Rmd diff --git a/R_examples/EDA.pdf b/R_examples/EDA.pdf new file mode 100644 index 0000000..5eafb7b --- /dev/null +++ b/R_examples/EDA.pdf @@ -0,0 +1 @@ +/annex/objects/SHA256E-s14875097--de4c18a17632e4b432b8689afd182156bafac24a985616c5a5ccc1fa8bfc53bc.pdf diff --git a/R_examples/prepDF.R b/R_examples/prepDF.R index edd862f..c5c4d40 100644 --- a/R_examples/prepDF.R +++ b/R_examples/prepDF.R @@ -1,148 +1 @@ - -rm(list=ls()) - -##################### -#The purpose of this file is to load up the datasets and clean them for processing. -# -##################### - -####set globals and make helpers -#basePath = '/home/kaylea/Research/taboo/' -basePath = '/gscratch/comdata/users/kaylea/taboo/' -dataPath = paste0(basePath, 'data/') -rawPath = paste0(basePath, 'raw_data/') -botsFile <- paste0(rawPath, 'botList.tsv') -endOfRecords= '2022-06-02 20:15:46' #derived from end of the action logs -endOfRecords = strptime(endOfRecords, "%Y-%m-%d %H:%M:%S") -startOfRecords= '2008-09-20 05:23:14' -startOfRecords = strptime(startOfRecords, "%Y-%m-%d %H:%M:%S") - - - -library(dplyr) -library(sqldf) -library(lubridate) -library(data.table) -library(urltools) - - -#recipe from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/ -readPileToDF <- function(path) { - file_list <- list.files(path) - print(file_list) - for (my_file in file_list){ - if (my_file == '_SUCCESS') { #spark metadata file, ignore - next - } - # if the merged dataset doesn't exist, create it - if (!exists("dataset")) { - print(paste0('Now Reading: ', path, my_file)) - dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE) - } - # if the merged dataset does exist, append to it - if (exists("dataset")){ - temp_dataset <-read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE) - dataset<-rbind(dataset, temp_dataset) - rm(temp_dataset) - } - } - dataset <- unique(dataset) - return(dataset) -} - -# Part 1 - load and clean revisions data - - -revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/')) -revDF$source <- "taboo" -#revDF.CTab$taboo <- 1 -revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0 -head(revDF.CTab) - -##drop unneeded fields -revDF.CTab$prediction <- NULL -revDF.CTab$filtered_title <- NULL -revDF.CTab$target <- NULL - -colnames(revDF.CTab) - -revDF <- revDF[!is.na(revDF$revid),] #drop any where revid is NA - -revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) #left (outer) join: all of revDF, plus any matches in userDF - -## eliminate any articles in both: - -### prepare bot filter -botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE) -botDF <- unique(botDF) #strip out any repetitions -#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE) -head(revDF) -head(botDF) -botDF$editor_id <- as.character(botDF$BotUserID) #just to make sure -revDF$editor_id <- as.character(revDF$editor_id) -head(revDF) -head(botDF) -revDF <- setDT(revDF) -botDF <- setDT(botDF) -revDF <- revDF[,isBot :=FALSE][botDF, isBot := TRUE, on= .(editor_id)] # this means: set column isBot to False. then, set the isBot to TRUE if a join could happen '.' means list. -isABot.tab <- table(revDF$isBot) - -## drop all bots here -revDF.clean <- subset(revDF, revDF$isBot==FALSE) -revDF <- NULL #so we don't use it accidentally -revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon)) - -##### filtering done, now to do some summing-up - -### Weighting -##for each article, the weight of each revision for that article is (N_rev_total/N_total articles)/N_art_revnum -#two criteria this meets: -#sum(weights) = total_revs -#sum(weights for given article) = sum(weights for all other articles) - -numEdits <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits=length(revid)) ##articlewise revisions count -numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors=length(unique(editor))) ###articlewise editors count, including IP addresses -n.revs <- length(revDF.clean$revid) ## total number of revisions -n.arts <- length(numEdits$encodedTitle) ## total number of articles -revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle") -revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle") -revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits - - -revDF.clean$ngramWeight <- revDF.clean$count #wasn't very descriptive -revDF.clean <- revDF.clean %>% mutate(got_reverted = - case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE)) - -table(revDF.clean$anon) -revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon=='true'), subset(revDF.clean, revDF.clean$anon=='false')) ##small number of NAs (187), look like parse problems -table(revDF.clean$anon) - - -##### dropping items with missing revids; if this happens, find out why -###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)] - - - - -artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize( - across(revid, length), - across(got_reverted, sum), - across(date_time, min) -) - -titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source) -titleSampleDF <- unique(titleSampleDF) - -artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) #which sample is it from -artDF$min.birthday <- strptime(artDF$date_time, "%Y-%m-%d %H:%M:%S") -artDF$startOfRecords <- startOfRecords -artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) #birthday or beginning of records, whichever comes later -artDF$secondsOldLog <- as.numeric(difftime(strptime(endOfRecords, "%Y-%m-%d %H:%M:%S"),strptime(artDF$birthOrLog, "%Y-%m-%d %H:%M:%S"), units="secs")) -## how many seconds old is each article inside the logged scope? - -artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected? - -artDF$pct.prot <- artDF.prot$pct.prot - -print("saving full image") -save.image(paste0(dataPath, "dataset1.RData"), version=2) +/annex/objects/SHA256E-s5621--5f1c06d1f3738dce21908a99d5dc1e9ee4ee86fc5af5bbc96bce9ad49dcc5aa2.R