diff --git a/R_examples/EDA.Rmd b/R_examples/EDA.Rmd index ec597ab..0fae152 100644 --- a/R_examples/EDA.Rmd +++ b/R_examples/EDA.Rmd @@ -196,18 +196,13 @@ This examines the same data but with a relative ranking approach for quality. Ea ```{r H3descriptives} ##revision quality artDF$pct_revert <- (artDF$got_reverted / artDF$revid) * 100 -table(artDF$taboo) +table(artDF$source) hist(artDF$got_reverted) hist(log(artDF$got_reverted)) hist(artDF$revid) hist(log(artDF$revid)) -## facet these hists by source!! -#cor.test(artDF$got_reverted, as.numeric(artDF$taboo)) -#cor.test(artDF$revid, as.numeric(artDF$taboo)) -#hist(log(artDF$got_reverted)) -#hist(log(artDF$revid)) -cor.test(artDF$pct_revert, as.numeric(artDF$taboo)) +cor.test(artDF$pct_revert, as.numeric(artDF$source)) cor.test(artDF$revid, artDF$pct_revert) #drop Barack Obama @@ -215,16 +210,16 @@ artDF <- subset(artDF, artDF$taboo!='0.5') table(artDF$taboo) artDF$taboo <- as.logical(artDF$taboo) -g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=taboo)) + +g <- ggplot(artDF, aes(x=revid, y=pct_revert, color=source)) + geom_point() + labs(x='Number of Revisions', y='Percent Reverted') g -ggplot(artDF, aes(x=got_reverted, color=taboo)) + +ggplot(artDF, aes(x=got_reverted, color=source)) + geom_boxplot() -g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=taboo)) + +g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=source)) + geom_point(alpha=.2) + geom_smooth() + geom_rug(alpha=.2)+ @@ -234,7 +229,7 @@ g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=got_reverted, color=ta g -g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=taboo)) + +g <- ggplot(artDF, aes(group=as.factor(taboo), x=revid, y=pct_revert, color=source)) + geom_point(alpha=.2) + geom_smooth() + geom_rug(alpha=.2)+ @@ -281,7 +276,7 @@ xTabAnon <- xtabs(~source+revert+anon,data=revDF.clean.norev) ftable(xTabAnon) prop.table(ftable(xTabAnon), 1) -ggplot(revDF.clean.norev, aes(x=anon, color=taboo)) + +ggplot(revDF.clean.norev, aes(x=anon, color=source)) + geom_boxplot() ``` @@ -326,13 +321,13 @@ g ```{r} summary(revDF.clean$userpage_text_chars) -g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=taboo)) + +g <- ggplot(revDF.clean, aes(x=log1p(userpage_text_chars), group=source)) + geom_histogram(binwidth = .5) + facet_grid(source~., scales='free_y') g -g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=taboo)) + +g <- ggplot(subset(revDF.clean, revDF.clean$userpage_text_chars < exp(4)), aes(x=log1p(userpage_text_chars), group=source)) + geom_histogram(binwidth = .5) + facet_grid(source~., scales='free_y') @@ -380,8 +375,6 @@ prop.table(table(gaveGenderDF$source, gaveGenderDF$gender), margin = 1) table(revDF.clean$emailable) -table(userDF$emailable) - prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1) ``` @@ -390,19 +383,16 @@ prop.table(table(revDF.clean$source, revDF.clean$emailable), margin = 1) ```{r protection} -artDF.prot.only <- subset(artDF.prot, artDF.prot$pct.prot > 0) -artDF.prot.only <- subset(artDF.prot.only, artDF.prot.only$taboo != 0.5) - -g <- ggplot(artDF.prot.only, aes(x=pct.prot, group=taboo)) + +g <- ggplot(artDF, aes(x=pct.prot, group=source)) + geom_boxplot() + labs(x='Protection Proportion') g -t.artDF <- subset(artDF.prot.only, artDF.prot.only$pct.prot > 0) +artDF.prot <- subset(artDF, artDF$pct.prot > 0) -g <- ggplot(t.artDF, aes(x=pct.prot, group=taboo)) + +g <- ggplot(artDF.prot, aes(x=pct.prot, group=source)) + geom_boxplot() + labs(x='Protection Proportion (non-zero only)')