
git-annex in kaylea@wumpus:~/Research/cdsc_examples_repository

This commit is contained in:
Kaylea Champion 2022-11-22 12:08:16 -08:00
parent 97e83d0f09
commit 8e20edaddd
3 changed files with 3 additions and 550 deletions

@@ -1,402 +1 @@
/annex/objects/SHA256E-s10456--a2f961c3e0484f5f253aefec4e79c1ede34a64c6a6e56c6f90647dc98809eabc.Rmd
---
title: "Taboo EDA"
author: "Kaylea Champion"
date: "2/3/2022"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library('ggplot2')
library('scales')
setwd("~/Research/cdsc_examples_repository/R_examples")
load('~/Research/cdsc_examples_repository/R_examples/data/EDA.RData')
## reversed log scale for ggplot axes, so that low (better) ranks plot higher
reverselog_trans <- function(base = exp(1)) {
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv,
            log_breaks(base = base),
            domain = c(1e-100, Inf))
}
options(scipen=999)
```
## Overview
The purpose of this study is to understand the production and development of articles on taboo topics.
### Independent Variables
- taboo (article)
### Dependent Variables
- article readership (h1)
- article quality (h2)
- contribution quality (h3)
- contributor identifiability (h4)
### Important Controls
- article age (quality tends to grow over time as people have the opportunity to contribute)
- time (general traffic trends wax and wane)
### Relationships to be Tested
- H1 - taboo topics have higher readership than random articles
- H2 - taboo topics are lower quality than random articles
- H3 - taboo topics receive lower quality contributions than random articles
- H4 - taboo topics are more likely to receive contributions from less identifiable contributors than random articles
### Unit(s) of Analysis
- The article (h1, h2)
- The revision of the article (h3, h4)
## H1 Descriptives: Viewership
```{r, H1descriptives, echo=FALSE,results='hide',fig.keep='all'}
##article readership -- vDF
hist(vDF$viewSum)
hist(log(vDF$viewSum))
```
We don't use the sum, however -- we use view rank, which behaves reasonably well if logged:
```{r}
hist(vDF$viewRankInMyMonth)
hist(log(vDF$viewRankInMyMonth))
## model against time with a loess smoother
g <- ggplot(vDF, aes(x=date, y=viewRankInMyMonth, color=source)) +
geom_smooth() +
#scale_y_continuous(trans='reverse') +
theme_bw() +
scale_y_continuous(trans=reverselog_trans(10),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="View Rank")
g
```
## H2 Descriptives: Quality
```{r H2descriptivesp1}
## absolute quality
g <- ggplot(qDF, aes(x=date, y=weighted_sum, group=encodedTitle, color=source)) +
#geom_smooth() +
#scale_y_continuous(trans='reverse') +
## change this to show trajectories
#geom_line(alpha=.05) + #angry R!!
#geom_point(alpha=.05) +
theme_bw() +
geom_rug(alpha=.1) +
# scale_y_continuous(trans=reverselog_trans(10),
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="Quality")
g
## facet grid
g <- ggplot(qDF, aes(x=source, y=weighted_sum)) +
geom_boxplot()
g
table(qDF$source)
g <- ggplot(qDF, aes(x=weighted_sum)) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
```
This graph shows that taboo articles originated much earlier than articles obtained through the random-sample approach.
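As a rough check on that reading, one could compare the earliest observed date by source (a sketch; it assumes an article's first row in qDF approximates its origination):
```{r, eval=FALSE}
aggregate(date ~ source, data = qDF, FUN = min)
```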
Another key variable is quality --- which again is ranked and logged.
```{r H2descriptivesp2}
## relative quality
hist(qDF$weighted_sum)
```
```{r}
hist(log(qDF$weighted_sum))
```
```{r}
hist(qDF$qualityRankInMyMonth)
```
```{r}
hist(log(qDF$qualityRankInMyMonth))
```
```{r}
qt <- subset(qDF, qDF$source=='taboo')
qr <- subset(qDF, qDF$source=='ngram')
mean(qDF$weighted_sum)
mean(qr$weighted_sum)
mean(qt$weighted_sum)
mean(qr$qualityRankInMyMonth)
mean(qt$qualityRankInMyMonth)
g <- ggplot(qDF, aes(x=qualityRankInMyMonth, color=source)) +
geom_boxplot()
g
g <- ggplot(qDF, aes(x=weighted_sum, color=source)) +
geom_boxplot()
g
g <- ggplot(qDF, aes(x=date, y=qualityRankInMyMonth, color=source)) +
geom_smooth() +
#scale_y_continuous(trans='reverse') +
theme_bw() +
geom_rug(alpha=.1) +
#scale_y_continuous(trans=reverselog_trans(10),
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
#scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="Quality Rank")
g
summary(qt$qualityRankInMyMonth)
summary(qr$qualityRankInMyMonth)
summary(vDF$viewRankInMyMonth)
summary(qDF$qualityRankInMyMonth)
```
This examines the same data but with a relative ranking approach for quality: each article's rank reflects its quality that month, compared to all other articles present in Wikipedia at the time. Lower numbers are better (and appear at the top of the graph).
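For reference, here is a minimal sketch of how such a within-month rank could be computed -- illustrative only, since qDF already ships with qualityRankInMyMonth, and the month and qualityRankSketch names are hypothetical:
```{r, eval=FALSE}
library(dplyr)
qDF %>%
  group_by(month = format(date, "%Y-%m")) %>%
  mutate(qualityRankSketch = rank(-weighted_sum, ties.method = "min")) %>% # rank 1 = best
  ungroup()
```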
## H3: Contribution Quality
```{r H3descriptives}
##revision quality
artDF$pct_revert <- (artDF$got_reverted / artDF$revid) * 100
table(artDF$source)
hist(artDF$got_reverted)
hist(log(artDF$got_reverted))
hist(artDF$revid)
hist(log(artDF$revid))
cor.test(artDF$pct_revert, as.numeric(as.factor(artDF$source))) # source is categorical; coerce via factor before correlating
cor.test(artDF$revid, artDF$pct_revert)
#drop Barack Obama
artDF <- subset(artDF, artDF$taboo!='0.5')
table(artDF$taboo)
artDF$taboo <- as.logical(artDF$taboo)
g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=source)) +
geom_point() +
labs(x='Number of Revisions', y='Percent Reverted')
g
ggplot(artDF, aes(x=got_reverted, color=source)) +
geom_boxplot()
g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=source)) +
geom_point(alpha=.2) +
geom_smooth() +
geom_rug(alpha=.2)+
scale_x_continuous(trans='log') +
scale_y_continuous(trans='log') +
labs(x='Number of Revisions', y='Number Reverted')
g
g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=source)) +
geom_point(alpha=.2) +
geom_smooth() +
geom_rug(alpha=.2)+
scale_x_continuous() +
scale_y_continuous() +
labs(x='Number of Revisions', y='Percent Reverted')
g
## drop edits that were later reverted, since they didn't really contribute per se
revDF.clean.norev <- subset(revDF.clean, revDF.clean$got_reverted==FALSE)
revDF.clean.norev <- subset(revDF.clean.norev, revDF.clean.norev$revert != '')
table(revDF.clean.norev$anon)
print('proportion of reverts by source')
cleanTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert)
ftable(cleanTable)
prop.table(cleanTable, 1)
prop.table(cleanTable, 2)
```
These results preliminarily suggest a log-log linear model. However, we do not have support in both samples across the full range of the data: the random sample contains a large number of zeroes.
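As a preview of that specification, here is a hedged sketch of such a model (log1p rather than log to accommodate the zeroes; not the final analysis):
```{r, eval=FALSE}
m <- lm(log1p(got_reverted) ~ log1p(revid) + source, data = artDF)
summary(m)
```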
## H4: Anonymity
H4 tests three limits on identifiability:
- contributing without an account
- contributing with a new account (or, being a relative newcomer)
- having a short user page
We want to make sure we have a reasonable amount of variation and handle zeroes correctly for these measures.
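As a first pass, here is a quick sketch of the variation and zero-handling checks these measures call for (column names as used elsewhere in this document):
```{r, eval=FALSE}
mean(revDF.clean.norev$anon == 'true')                # share of edits made without an account
summary(revDF.clean.norev$editor_nth_edit_nocollapse) # spread of editor experience
mean(revDF.clean$userpage_text_chars == 0)            # share of zero-length user pages
```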
### Contributing without an account
Proportion of Edits that are made without an account:
```{r}
anonTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert, revDF.clean.norev$anon)
anonTable
ftable(anonTable)
prop.table(ftable(anonTable),1)
xTabAnon <- xtabs(~source+revert+anon,data=revDF.clean.norev)
ftable(xTabAnon)
prop.table(ftable(xTabAnon), 1)
ggplot(revDF.clean.norev, aes(x=anon, color=source)) +
geom_boxplot()
```
### Contributing with a new account
```{r H4descriptives}
g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
g <- ggplot(revDF.clean.norev, aes(x=log(editor_nth_edit_nocollapse))) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
geom_boxplot() +
facet_grid(source~., scales='free_y')
g
t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==TRUE)
mean(t$editor_nth_edit_nocollapse)
t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==FALSE)
mean(t$editor_nth_edit_nocollapse)
g <- ggplot(revDF.clean, aes(x=log(editor_nth_edit_nocollapse))) +
geom_boxplot() +
facet_grid(source~., scales='free_y')
g
```
### Having a short user page
```{r}
summary(revDF.clean$userpage_text_chars)
g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=source)) +
geom_histogram(binwidth = .5) +
facet_grid(source~., scales='free_y')
g
g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=source)) +
geom_histogram(binwidth = .5) +
facet_grid(source~., scales='free_y')
g
# look at last edit, what's the 'final' quality?
library(dplyr)
finalQDF <- qDF %>% group_by(encodedTitle) %>% filter(timestamp == max(timestamp))
g <- ggplot(finalQDF, aes(x=weighted_sum, y=source)) +
geom_boxplot() +
labs(x='Final Quality')
g
summary(artDF$got_reverted/artDF$revid)
```
### Gender
I used the API to gather a bit of user information: whether they are emailable, whether gender was specified, and, if it was, whether the person is female.
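For context, here is a sketch of the kind of call involved (not the exact collection script; ExampleUser is a placeholder). MediaWiki's list=users endpoint exposes gender and emailable via usprop:
```{r, eval=FALSE}
library(jsonlite)
resp <- fromJSON(paste0(
  "https://en.wikipedia.org/w/api.php?action=query&list=users",
  "&ususers=ExampleUser&usprop=gender%7Cemailable&format=json"))
resp$query$users
```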
```{r gender user data}
table(revDF.clean$gender)
userDF <- data.frame('editor'=revDF.clean$editor, 'gender' = revDF.clean$gender, 'emailable' = revDF.clean$emailable, 'anon'=revDF.clean$anon)
userDF <- subset(userDF, userDF$anon == 'false')
userDF <- unique(userDF)
table(userDF$gender)
prop.table(table(revDF.clean$source, revDF.clean$gender), margin = 1)
gaveGenderDF <- subset(revDF.clean, revDF.clean$gender != 'unknown')
prop.table(table(gaveGenderDF$source, gaveGenderDF$gender), margin = 1)
```
### Being Emailable
```{r emailable user data}
table(revDF.clean$emailable)
prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1)
```
### Protections
```{r protection}
g <- ggplot(artDF, aes(x=pct.prot, group=source)) +
geom_boxplot() +
labs(x='Protection Proportion')
g
artDF.prot <- subset(artDF, artDF$pct.prot > 0)
g <- ggplot(artDF.prot, aes(x=pct.prot, group=source)) +
geom_boxplot() +
labs(x='Protection Proportion (non-zero only)')
g
```

R_examples/EDA.pdf Normal file

@@ -0,0 +1 @@
/annex/objects/SHA256E-s14875097--de4c18a17632e4b432b8689afd182156bafac24a985616c5a5ccc1fa8bfc53bc.pdf


@@ -1,148 +1 @@
/annex/objects/SHA256E-s5621--5f1c06d1f3738dce21908a99d5dc1e9ee4ee86fc5af5bbc96bce9ad49dcc5aa2.R
rm(list=ls())
#####################
#The purpose of this file is to load up the datasets and clean them for processing.
#
#####################
####set globals and make helpers
#basePath = '/home/kaylea/Research/taboo/'
basePath = '/gscratch/comdata/users/kaylea/taboo/'
dataPath = paste0(basePath, 'data/')
rawPath = paste0(basePath, 'raw_data/')
botsFile <- paste0(rawPath, 'botList.tsv')
endOfRecords = '2022-06-02 20:15:46' #derived from end of the action logs
endOfRecords = as.POSIXct(endOfRecords, format="%Y-%m-%d %H:%M:%S") #POSIXct (not POSIXlt) so dates work in data frame columns and pmax()
startOfRecords = '2008-09-20 05:23:14'
startOfRecords = as.POSIXct(startOfRecords, format="%Y-%m-%d %H:%M:%S")
library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)
#recipe adapted from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)
  dataset <- NULL
  for (my_file in file_list) {
    if (my_file == '_SUCCESS') { #spark metadata file, ignore
      next
    }
    print(paste0('Now Reading: ', path, my_file))
    temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE,
                               sep="\t", stringsAsFactors=FALSE)
    # start the merged dataset with the first file, then append each later file
    if (is.null(dataset)) {
      dataset <- temp_dataset
    } else {
      dataset <- rbind(dataset, temp_dataset)
    }
    rm(temp_dataset)
  }
  dataset <- unique(dataset) #drop any exact-duplicate rows
  return(dataset)
}
# Part 1 - load and clean revisions data
revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF$source <- "taboo"
#revDF.CTab$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)
##drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL
colnames(revDF)
revDF <- revDF[!is.na(revDF$revid),] #drop any where revid is NA
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) #left (outer) join: all of revDF, plus any matches in userDF
## eliminate any articles in both:
### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) #strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) #just to make sure
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)] # initialize isBot to FALSE for every row, then flip it to TRUE wherever editor_id matches a row of botDF; .() is data.table shorthand for list()
isABot.tab <- table(revDF$isBot)
## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot==FALSE)
revDF <- NULL #so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon))
##### filtering done, now to do some summing-up
### Weighting
## for each article, the weight of each revision is (N_revs_total / N_articles) / N_revs_for_that_article
## two criteria this meets:
## sum(weights) == total number of revisions
## sum(weights) for any given article == sum(weights) for any other article
numEdits <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits=length(revid)) ##articlewise revisions count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors=length(unique(editor))) ###articlewise editors count, including IP addresses
n.revs <- length(revDF.clean$revid) ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits
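## quick sanity check on the two weighting criteria above (a sketch; safe to
## comment out in production runs)
stopifnot(isTRUE(all.equal(sum(revDF.clean$weight), n.revs)))
perArticleWeight <- tapply(revDF.clean$weight, revDF.clean$encodedTitle, sum)
stopifnot(diff(range(perArticleWeight)) < 1e-6) # every article carries equal total weight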
revDF.clean$ngramWeight <- revDF.clean$count #copy to a more descriptive name than 'count'
revDF.clean <- revDF.clean %>% mutate(got_reverted =
case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE))
table(revDF.clean$anon)
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon=='true'), subset(revDF.clean, revDF.clean$anon=='false')) ##small number of NAs (187), look like parse problems
table(revDF.clean$anon)
##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]
artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
across(revid, length),
across(got_reverted, sum),
across(date_time, min)
)
titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)
artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) #which sample is it from
artDF$min.birthday <- as.POSIXct(artDF$date_time, format="%Y-%m-%d %H:%M:%S")
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) #birthday or beginning of records, whichever comes later
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))
## how many seconds old is each article inside the logged scope?
artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
artDF$pct.prot <- artDF.prot$pct.prot
print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)