cdsc_examples_repository/R_examples/EDA.Rmd

---
title: "Taboo EDA"
author: "Kaylea Champion"
date: "2/3/2022"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library('ggplot2')
library('scales')
setwd("~/Research/cdsc_examples_repository/R_examples")
load('~/Research/cdsc_examples_repository/R_examples/data/EDA.RData')

reverselog_trans <- function(base = exp(1)) {
    trans <- function(x) -log(x, base)
    inv <- function(x) base^(-x)
    trans_new(paste0("reverselog-", format(base)), trans, inv,
              log_breaks(base = base),
              domain = c(1e-100, Inf))
}

options(scipen=999)
```

## Overview

The purpose of this study is to understand the production and development of taboo topics.

### Independent Variables
- taboo (article)

### Dependent Variables
- article readership (h1)
- article quality (h2)
- contribution quality (h3)
- contributor identifiability (h4)

### Important Controls

- article age (quality tends to grow over time as people have the opportunity to contribute)
- time (general traffic trends wax and wane)

### Relationships to be Tested

- H1 - taboo topics have higher readership than random articles
- H2 - taboo topics are lower quality than random articles
- H3 - taboo topics receive lower quality contributions than random articles
- H4 - taboo topics are more likely to receive contributions from less identifiable contributors than random articles

### Unit(s) of Analysis
 - The article (h1, h2)
 - The revision of the article (h3, h4)

## H1 Descriptives: Viewership

```{r, H1descriptives,  echo=FALSE,results='hide',fig.keep='all'}
##article readership -- vDF

hist(vDF$viewSum)
hist(log(vDF$viewSum))
```

We don't use sum, however -- we use view rank. View Rank behaves reasonably well if logged:

```{r}


hist(vDF$viewRankInMyMonth)
hist(log(vDF$viewRankInMyMonth))

## still a model w/time - loess
g <- ggplot(vDF, aes(x=date, y=viewRankInMyMonth, color=source)) +
  geom_smooth() +
  #scale_y_continuous(trans='reverse') +
  theme_bw() +
  scale_y_continuous(trans=reverselog_trans(10),
                     breaks = trans_breaks("log10", function(x) 10^x),
                     labels = trans_format("log10", math_format(10^.x))) +
  scale_x_date() +
  scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
  theme(legend.position = 'bottom', legend.title = element_blank()) +
  labs(x="Time", y="View Rank")

g

```

```{r H2descriptivesp1}

## absolute quality
g <- ggplot(qDF, aes(x=date, y=weighted_sum, group=encodedTitle, color=source)) +
  #geom_smooth() +
  #scale_y_continuous(trans='reverse') +
  ## change this to show trajectories
  #geom_line(alpha=.05) + #angry R!!
  #geom_point(alpha=.05) +
  theme_bw() +
  geom_rug(alpha=.1) +
#  scale_y_continuous(trans=reverselog_trans(10),
#                     breaks = trans_breaks("log10", function(x) 10^x),
#                     labels = trans_format("log10", math_format(10^.x))) +
  scale_x_date() +
  scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
  theme(legend.position = 'bottom', legend.title = element_blank()) +
  labs(x="Time", y="Quality")

g

## facet grid
g <- ggplot(qDF, aes(x=source, y=weighted_sum)) +
  geom_boxplot()

g

table(qDF$source)

g <- ggplot(qDF, aes(x=weighted_sum)) +
  geom_histogram() +
  facet_grid(source~., scales='free_y')

g

```

This graph shows that taboo articles originated much earlier than those obtained with a random sample approach.

Another key variable is quality --- which again is ranked and logged.

```{r H2descriptivesp2}
## relative quality

hist(qDF$weighted_sum)
```

```{r}
hist(log(qDF$weighted_sum))
```

```{r}
hist(qDF$qualityRankInMyMonth)
```

```{r}
hist(log(qDF$qualityRankInMyMonth))
```

```{r}

qt <- subset(qDF, qDF$source=='taboo')
qr <- subset(qDF, qDF$source=='ngram')

mean(qDF$weighted_sum)
mean(qr$weighted_sum)
mean(qt$weighted_sum)
mean(qr$qualityRankInMyMonth)
mean(qt$qualityRankInMyMonth)

g <- ggplot(qDF, aes(x=qualityRankInMyMonth, color=source)) +
  geom_boxplot()

g

g <- ggplot(qDF, aes(x=weighted_sum, color=source)) +
  geom_boxplot()

g

g <- ggplot(qDF, aes(x=date, y=qualityRankInMyMonth, color=source)) +
  geom_smooth() +
  #scale_y_continuous(trans='reverse') +
  theme_bw() +
  geom_rug(alpha=.1) +
  #scale_y_continuous(trans=reverselog_trans(10),
  #                   breaks = trans_breaks("log10", function(x) 10^x),
  #                   labels = trans_format("log10", math_format(10^.x))) +
  scale_x_date() +
  #scale_color_discrete(name="Source", labels=c("Random", "Taboo")) +
  theme(legend.position = 'bottom', legend.title = element_blank()) +
  labs(x="Time", y="Quality Rank")

g

summary(qt$qualityRankInMyMonth)
summary(qr$qualityRankInMyMonth)
summary(vDF$viewRankInMyMonth)
summary(qDF$qualityRankInMyMonth)


```

This examines the same data but with a relative ranking approach for quality. Each article's relative rank is in terms of quality that month, compared to all other articles that were present in Wikipedia at that time. Lower numbers are better (at the top of the graph).

## H3 - Contribution Quality

```{r H3descriptives}
##revision quality
artDF$pct_revert <- (artDF$got_reverted / artDF$revid) * 100
table(artDF$source)
hist(artDF$got_reverted)
hist(log(artDF$got_reverted))
hist(artDF$revid)
hist(log(artDF$revid))

cor.test(artDF$pct_revert, as.numeric(artDF$source))
cor.test(artDF$revid, artDF$pct_revert)

#drop Barack Obama
artDF <- subset(artDF, artDF$taboo!='0.5')
table(artDF$taboo)
artDF$taboo <- as.logical(artDF$taboo)

g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=source)) +
  geom_point() +
  labs(x='Number of Revisions', y='Percent Reverted')

g

ggplot(artDF, aes(x=got_reverted, color=source)) +
  geom_boxplot()

g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=source)) +
  geom_point(alpha=.2) +
  geom_smooth() +
  geom_rug(alpha=.2)+
  scale_x_continuous(trans='log') +
  scale_y_continuous(trans='log') +
  labs(x='Number of Revisions', y='Number Reverted')

g

g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=source)) +
  geom_point(alpha=.2) +
  geom_smooth() +
  geom_rug(alpha=.2)+
  scale_x_continuous() +
  scale_y_continuous() +
  labs(x='Number of Revisions', y='Percent Reverted')

g

## pull edits that were later reverted, since they therefore didn't really contribute per se
revDF.clean.norev <- subset(revDF.clean, revDF.clean$got_reverted==FALSE)
revDF.clean.norev <- subset(revDF.clean.norev, revDF.clean.norev$revert != '')
table(revDF.clean.norev$anon)

print('proportion of reverts by source')
cleanTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert)
ftable(cleanTable)
prop.table(cleanTable, 1)
prop.table(cleanTable, 2)

```

These results preliminarily suggest a log-log linear model. However, we observe that we do not have support in both samples across the full range of the data. The random sample contains a large number of zeroes.

## H4: Anonymity

H4 tests three limits on identifiability:
* contributing without an account
* contributing with a new account (or, being a relative newcomer)
* having a short user page.

We want to make sure we have a reasonable amount of variation and handle zeroes correctly for these measures.
### Contributing without an account

Proportion of Edits that are made without an account:
```{r}

anonTable <- table(revDF.clean.norev$source, revDF.clean.norev$revert, revDF.clean.norev$anon)
anonTable
ftable(anonTable)
prop.table(ftable(anonTable),1)

xTabAnon <- xtabs(~source+revert+anon,data=revDF.clean.norev)
ftable(xTabAnon)
prop.table(ftable(xTabAnon), 1)

ggplot(revDF.clean.norev, aes(x=anon, color=source)) +
  geom_boxplot()
```


### Contributing with a new account
```{r H4descriptives}

g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
  geom_histogram() +
  facet_grid(source~., scales='free_y')

g

g <- ggplot(revDF.clean.norev, aes(x=log(editor_nth_edit_nocollapse))) +
  geom_histogram() +
  facet_grid(source~., scales='free_y')

g

g <- ggplot(revDF.clean.norev, aes(x=editor_nth_edit_nocollapse)) +
  geom_boxplot() +
  facet_grid(source~., scales='free_y')

g

t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==TRUE)
mean(t$editor_nth_edit_nocollapse)

t <- subset(revDF.clean.norev, as.logical(revDF.clean.norev$anon)==FALSE)
mean(t$editor_nth_edit_nocollapse)

g <- ggplot(revDF.clean, aes(x=log(editor_nth_edit_nocollapse))) +
  geom_boxplot() +
  facet_grid(source~., scales='free_y')

g

```

### Having a short user page

```{r}
summary(revDF.clean$userpage_text_chars)

g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=source)) +
  geom_histogram(binwidth = .5) +
  facet_grid(source~., scales='free_y')

g

g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=source)) +
  geom_histogram(binwidth = .5) +
  facet_grid(source~., scales='free_y')

g


# look at last edit, what's the 'final' quality?
library(dplyr)
finalQDF <- qDF %>% group_by(encodedTitle) %>% filter(timestamp == max(timestamp))

g <- ggplot(finalQDF, aes(x=weighted_sum, y=source)) +
  geom_boxplot() +
  labs(x='Final Quality')

g

summary(artDF$got_reverted/artDF$revid)
```


### Gender

I used the API to find out a bit of user information -- are they emailable, whether gender was specified, and if it was specified, whether the person was female.

```{r gender user data}

table(revDF.clean$gender)
userDF <- data.frame('editor'=revDF.clean$editor, 'gender' = revDF.clean$gender, 'emailable' = revDF.clean$emailable, 'anon'=revDF.clean$anon)

userDF <- subset(userDF, userDF$anon == 'false')
userDF <- unique(userDF)
table(userDF$gender)

prop.table(table(revDF.clean$source, revDF.clean$gender), margin = 1)
gaveGenderDF <- subset(revDF.clean, revDF.clean$gender != 'unknown')
prop.table(table(gaveGenderDF$source, gaveGenderDF$gender), margin = 1)

```


### Being Emailable


```{r emailable user data}

table(revDF.clean$emailable)

prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1)

```

### Protections

```{r protection}


g <- ggplot(artDF, aes(x=pct.prot, group=source)) +
  geom_boxplot() +
  labs(x='Protection Proportion')

g

artDF.prot <- subset(artDF, artDF$pct.prot > 0)

g <- ggplot(artDF.prot, aes(x=pct.prot, group=source)) +
  geom_boxplot() +
  labs(x='Protection Proportion (non-zero only)')

g


```