
git-annex in kaylea@wumpus:~/Research/cdsc_examples_repository

This commit is contained in:
Kaylea Champion 2022-11-22 12:08:16 -08:00
parent 97e83d0f09
commit 8e20edaddd
3 changed files with 3 additions and 550 deletions

@@ -1,402 +1 @@
/annex/objects/SHA256E-s10456--a2f961c3e0484f5f253aefec4e79c1ede34a64c6a6e56c6f90647dc98809eabc.Rmd
---
title: "Taboo EDA"
author: "Kaylea Champion"
date: "2/3/2022"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library('ggplot2')
library('scales')
setwd("~/Research/cdsc_examples_repository/R_examples")
load('~/Research/cdsc_examples_repository/R_examples/data/EDA.RData')
## reversed log scale for ggplot axes, so that low (better) ranks plot higher
reverselog_trans <- function(base = exp(1)) {
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv,
            log_breaks(base = base),
            domain = c(1e-100, Inf))
}
options(scipen=999)
```
## Overview
The purpose of this study is to understand the production and development of articles on taboo topics.
### Independent Variables
- taboo (article)
### Dependent Variables
- article readership (h1)
- article quality (h2)
- contribution quality (h3)
- contributor identifiability (h4)
### Important Controls
- article age (quality tends to grow over time as people have the opportunity to contribute)
- time (general traffic trends wax and wane)
### Relationships to be Tested
- H1 - taboo topics have higher readership than random articles
- H2 - taboo topics are lower quality than random articles
- H3 - taboo topics receive lower quality contributions than random articles
- H4 - taboo topics are more likely to receive contributions from less identifiable contributors than random articles
### Unit(s) of Analysis
- The article (h1, h2)
- The revision of the article (h3, h4)
## H1 Descriptives: Viewership
```{r, H1descriptives, echo=FALSE,results='hide',fig.keep='all'}
##article readership -- vDF
hist(vDF$viewSum)
hist(log(vDF$viewSum))
```
We don't use the sum, however -- we use view rank, which behaves reasonably well if logged:
```{r}
hist(vDF$viewRankInMyMonth)
hist(log(vDF$viewRankInMyMonth))
## model against time with a loess smoother
g <- ggplot(vDF, aes(x=date, y=viewRankInMyMonth, color=source)) +
geom_smooth() +
#scale_y_continuous(trans='reverse') +
theme_bw() +
scale_y_continuous(trans=reverselog_trans(10),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="View Rank")
g
```
## H2 Descriptives: Quality
```{r H2descriptivesp1}
## absolute quality
g <- ggplot(qDF, aes(x=date, y=weighted_sum, group=encodedTitle, color=source)) +
#geom_smooth() +
#scale_y_continuous(trans='reverse') +
## change this to show trajectories
#geom_line(alpha=.05) + #angry R!!
#geom_point(alpha=.05) +
theme_bw() +
geom_rug(alpha=.1) +
# scale_y_continuous(trans=reverselog_trans(10),
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="Quality")
g
## facet grid
g <- ggplot(qDF, aes(x=source, y=weighted_sum)) +
geom_boxplot()
g
table(qDF$source)
g <- ggplot(qDF, aes(x=weighted_sum)) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
```
This graph shows that taboo articles originated much earlier than articles obtained through the random-sample approach.
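As a rough check on that reading, one could compare the earliest observed date by source (a sketch; it assumes an article's first row in qDF approximates its origination):
```{r, eval=FALSE}
aggregate(date ~ source, data = qDF, FUN = min)
```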
Another key variable is quality --- which again is ranked and logged.
```{r H2descriptivesp2}
## relative quality
hist(qDF$weighted_sum)
```
```{r}
hist(log(qDF$weighted_sum))
```
```{r}
hist(qDF$qualityRankInMyMonth)
```
```{r}
hist(log(qDF$qualityRankInMyMonth))
```
```{r}
qt <- subset(qDF, qDF$source=='taboo')
qr <- subset(qDF, qDF$source=='ngram')
mean(qDF$weighted_sum)
mean(qr$weighted_sum)
mean(qt$weighted_sum)
mean(qr$qualityRankInMyMonth)
mean(qt$qualityRankInMyMonth)
g <- ggplot(qDF, aes(x=qualityRankInMyMonth, color=source)) +
geom_boxplot()
g
g <- ggplot(qDF, aes(x=weighted_sum, color=source)) +
geom_boxplot()
g
g <- ggplot(qDF, aes(x=date, y=qualityRankInMyMonth, color=source)) +
geom_smooth() +
#scale_y_continuous(trans='reverse') +
theme_bw() +
geom_rug(alpha=.1) +
#scale_y_continuous(trans=reverselog_trans(10),
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))) +
scale_x_date() +
#scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
theme(legend.position = 'bottom', legend.title = element_blank()) +
labs(x="Time", y="Quality Rank")
g
summary(qt$qualityRankInMyMonth)
summary(qr$qualityRankInMyMonth)
summary(vDF$viewRankInMyMonth)
summary(qDF$qualityRankInMyMonth)
```
This examines the same data but with a relative ranking approach for quality: each article's rank reflects its quality that month, compared to all other articles present in Wikipedia at the time. Lower numbers are better (and appear at the top of the graph).
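For reference, here is a minimal sketch of how such a within-month rank could be computed -- illustrative only, since qDF already ships with qualityRankInMyMonth, and the month and qualityRankSketch names are hypothetical:
```{r, eval=FALSE}
library(dplyr)
qDF %>%
  group_by(month = format(date, "%Y-%m")) %>%
  mutate(qualityRankSketch = rank(-weighted_sum, ties.method = "min")) %>% # rank 1 = best
  ungroup()
```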
## H3: Contribution Quality
```{r H3descriptives}
##revision quality
artDF$pct_revert <- (artDF$got_reverted / artDF$revid) * 100
table(artDF$source)
hist(artDF$got_reverted)
hist(log(artDF$got_reverted))
hist(artDF$revid)
hist(log(artDF$revid))
cor.test(artDF$pct_revert, as.numeric(as.factor(artDF$source))) # source is categorical; coerce via factor before correlating
cor.test(artDF$revid, artDF$pct_revert)
#drop Barack Obama
artDF <- subset(artDF, artDF$taboo!='0.5')
table(artDF$taboo)
artDF$taboo <- as.logical(artDF$taboo)
g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=source)) +
geom_point() +
labs(x='Number of Revisions', y='Percent Reverted')
g
ggplot(artDF, aes(x=got_reverted, color=source)) +
geom_boxplot()
g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=source)) +
geom_point(alpha=.2) +
geom_smooth() +
geom_rug(alpha=.2)+
scale_x_continuous(trans='log') +
scale_y_continuous(trans='log') +
labs(x='Number of Revisions', y='Number Reverted')
g
g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=source)) +
geom_point(alpha=.2) +
geom_smooth() +
geom_rug(alpha=.2)+
scale_x_continuous() +
scale_y_continuous() +
labs(x='Number of Revisions', y='Percent Reverted')
g
## drop edits that were later reverted, since they didn't really contribute per se
revDF.clean.norev <- subset(revDF.clean, revDF.clean$got_reverted==FALSE)
revDF.clean.norev <- subset(revDF.clean.norev, revDF.clean.norev$revert != '')
table(revDF.clean.norev$anon)
print('proportion of reverts by source')
cleanTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert)
ftable(cleanTable)
prop.table(cleanTable, 1)
prop.table(cleanTable, 2)
```
These results preliminarily suggest a log-log linear model. However, we do not have support in both samples across the full range of the data: the random sample contains a large number of zeroes.
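As a preview of that specification, here is a hedged sketch of such a model (log1p rather than log to accommodate the zeroes; not the final analysis):
```{r, eval=FALSE}
m <- lm(log1p(got_reverted) ~ log1p(revid) + source, data = artDF)
summary(m)
```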
## H4: Anonymity
H4 tests three limits on identifiability:
- contributing without an account
- contributing with a new account (or, being a relative newcomer)
- having a short user page
We want to make sure we have a reasonable amount of variation and handle zeroes correctly for these measures.
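As a first pass, here is a quick sketch of the variation and zero-handling checks these measures call for (column names as used elsewhere in this document):
```{r, eval=FALSE}
mean(revDF.clean.norev$anon == 'true')                # share of edits made without an account
summary(revDF.clean.norev$editor_nth_edit_nocollapse) # spread of editor experience
mean(revDF.clean$userpage_text_chars == 0)            # share of zero-length user pages
```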
### Contributing without an account
Proportion of Edits that are made without an account:
```{r}
anonTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert, revDF.clean.norev$anon)
anonTable
ftable(anonTable)
prop.table(ftable(anonTable),1)
xTabAnon <- xtabs(~source+revert+anon,data=revDF.clean.norev)
ftable(xTabAnon)
prop.table(ftable(xTabAnon), 1)
ggplot(revDF.clean.norev, aes(x=anon, color=source)) +
geom_boxplot()
```
### Contributing with a new account
```{r H4descriptives}
g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
g <- ggplot(revDF.clean.norev, aes(x=log(editor_nth_edit_nocollapse))) +
geom_histogram() +
facet_grid(source~., scales='free_y')
g
g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
geom_boxplot() +
facet_grid(source~., scales='free_y')
g
t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==TRUE)
mean(t$editor_nth_edit_nocollapse)
t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==FALSE)
mean(t$editor_nth_edit_nocollapse)
g <- ggplot(revDF.clean, aes(x=log(editor_nth_edit_nocollapse))) +
geom_boxplot() +
facet_grid(source~., scales='free_y')
g
```
### Having a short user page
```{r}
summary(revDF.clean$userpage_text_chars)
g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=source)) +
geom_histogram(binwidth = .5) +
facet_grid(source~., scales='free_y')
g
g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=source)) +
geom_histogram(binwidth = .5) +
facet_grid(source~., scales='free_y')
g
# look at last edit, what's the 'final' quality?
library(dplyr)
finalQDF <- qDF %>% group_by(encodedTitle) %>% filter(timestamp == max(timestamp))
g <- ggplot(finalQDF, aes(x=weighted_sum, y=source)) +
geom_boxplot() +
labs(x='Final Quality')
g
summary(artDF$got_reverted/artDF$revid)
```
### Gender
I used the API to gather a bit of user information: whether they are emailable, whether gender was specified, and, if it was, whether the person is female.
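For context, here is a sketch of the kind of call involved (not the exact collection script; ExampleUser is a placeholder). MediaWiki's list=users endpoint exposes gender and emailable via usprop:
```{r, eval=FALSE}
library(jsonlite)
resp <- fromJSON(paste0(
  "https://en.wikipedia.org/w/api.php?action=query&list=users",
  "&ususers=ExampleUser&usprop=gender%7Cemailable&format=json"))
resp$query$users
```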
```{r gender user data}
table(revDF.clean$gender)
userDF <- data.frame('editor'=revDF.clean$editor, 'gender' = revDF.clean$gender, 'emailable' = revDF.clean$emailable, 'anon'=revDF.clean$anon)
userDF <- subset(userDF, userDF$anon == 'false')
userDF <- unique(userDF)
table(userDF$gender)
prop.table(table(revDF.clean$source, revDF.clean$gender), margin = 1)
gaveGenderDF <- subset(revDF.clean, revDF.clean$gender != 'unknown')
prop.table(table(gaveGenderDF$source, gaveGenderDF$gender), margin = 1)
```
### Being Emailable
```{r emailable user data}
table(revDF.clean$emailable)
prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1)
```
### Protections
```{r protection}
g <- ggplot(artDF, aes(x=pct.prot, group=source)) +
geom_boxplot() +
labs(x='Protection Proportion')
g
artDF.prot <- subset(artDF, artDF$pct.prot > 0)
g <- ggplot(artDF.prot, aes(x=pct.prot, group=source)) +
geom_boxplot() +
labs(x='Protection Proportion (non-zero only)')
g
```

R_examples/EDA.pdf Normal file

@@ -0,0 +1 @@
/annex/objects/SHA256E-s14875097--de4c18a17632e4b432b8689afd182156bafac24a985616c5a5ccc1fa8bfc53bc.pdf


@@ -1,148 +1 @@
/annex/objects/SHA256E-s5621--5f1c06d1f3738dce21908a99d5dc1e9ee4ee86fc5af5bbc96bce9ad49dcc5aa2.R
rm(list=ls())
#####################
#The purpose of this file is to load up the datasets and clean them for processing.
#
#####################
####set globals and make helpers
#basePath = '/home/kaylea/Research/taboo/'
basePath = '/gscratch/comdata/users/kaylea/taboo/'
dataPath = paste0(basePath, 'data/')
rawPath = paste0(basePath, 'raw_data/')
botsFile <- paste0(rawPath, 'botList.tsv')
endOfRecords = '2022-06-02 20:15:46' #derived from end of the action logs
endOfRecords = as.POSIXct(endOfRecords, format="%Y-%m-%d %H:%M:%S") #POSIXct (not POSIXlt) so dates work in data frame columns and pmax()
startOfRecords = '2008-09-20 05:23:14'
startOfRecords = as.POSIXct(startOfRecords, format="%Y-%m-%d %H:%M:%S")
library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)
#recipe adapted from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)
  dataset <- NULL
  for (my_file in file_list) {
    if (my_file == '_SUCCESS') { #spark metadata file, ignore
      next
    }
    print(paste0('Now Reading: ', path, my_file))
    temp_dataset <- read.table(paste0(path, my_file), quote="\"", header=TRUE,
                               sep="\t", stringsAsFactors=FALSE)
    # start the merged dataset with the first file, then append each later file
    if (is.null(dataset)) {
      dataset <- temp_dataset
    } else {
      dataset <- rbind(dataset, temp_dataset)
    }
    rm(temp_dataset)
  }
  dataset <- unique(dataset) #drop any exact-duplicate rows
  return(dataset)
}
# Part 1 - load and clean revisions data
revDF = readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF$source <- "taboo"
#revDF.CTab$taboo <- 1
revDF$userpage_text_chars[is.na(revDF$userpage_text_chars)] <- 0
head(revDF)
##drop unneeded fields
revDF$prediction <- NULL
revDF$filtered_title <- NULL
revDF$target <- NULL
colnames(revDF)
revDF <- revDF[!is.na(revDF$revid),] #drop any where revid is NA
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) #left (outer) join: all of revDF, plus any matches in userDF
## eliminate any articles in both:
### prepare bot filter
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) #strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
botDF$editor_id <- as.character(botDF$BotUserID) #just to make sure
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
revDF <- setDT(revDF)
botDF <- setDT(botDF)
revDF <- revDF[, isBot := FALSE][botDF, isBot := TRUE, on = .(editor_id)] # initialize isBot to FALSE for every row, then flip it to TRUE wherever editor_id matches a row of botDF; .() is data.table shorthand for list()
isABot.tab <- table(revDF$isBot)
## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot==FALSE)
revDF <- NULL #so we don't use it accidentally
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon))
##### filtering done, now to do some summing-up
### Weighting
## for each article, the weight of each revision is (N_revs_total / N_articles) / N_revs_for_that_article
## two criteria this meets:
## sum(weights) == total number of revisions
## sum(weights) for any given article == sum(weights) for any other article
numEdits <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEdits=length(revid)) ##articlewise revisions count
numEditors <- revDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(numEditors=length(unique(editor))) ###articlewise editors count, including IP addresses
n.revs <- length(revDF.clean$revid) ## total number of revisions
n.arts <- length(numEdits$encodedTitle) ## total number of articles
revDF.clean <- merge(revDF.clean, numEdits, by="encodedTitle")
revDF.clean <- merge(revDF.clean, numEditors, by="encodedTitle")
revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits
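## quick sanity check on the two weighting criteria above (a sketch; safe to
## comment out in production runs)
stopifnot(isTRUE(all.equal(sum(revDF.clean$weight), n.revs)))
perArticleWeight <- tapply(revDF.clean$weight, revDF.clean$encodedTitle, sum)
stopifnot(diff(range(perArticleWeight)) < 1e-6) # every article carries equal total weight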
revDF.clean$ngramWeight <- revDF.clean$count #copy to a more descriptive name than 'count'
revDF.clean <- revDF.clean %>% mutate(got_reverted =
case_when(is.na(reverted_by) ~ FALSE, TRUE ~ TRUE))
table(revDF.clean$anon)
revDF.clean <- rbind(subset(revDF.clean, revDF.clean$anon=='true'), subset(revDF.clean, revDF.clean$anon=='false')) ##small number of NAs (187), look like parse problems
table(revDF.clean$anon)
##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]
artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
across(revid, length),
across(got_reverted, sum),
across(date_time, min)
)
titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)
artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) #which sample is it from
artDF$min.birthday <- as.POSIXct(artDF$date_time, format="%Y-%m-%d %H:%M:%S")
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) #birthday or beginning of records, whichever comes later
artDF$secondsOldLog <- as.numeric(difftime(endOfRecords, artDF$birthOrLog, units="secs"))
## how many seconds old is each article inside the logged scope?
artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?
artDF$pct.prot <- artDF.prot$pct.prot
print("saving full image")
save.image(paste0(dataPath, "dataset1.RData"), version=2)