1
0

Initial commit

p#	new file:   runwikiq.sh
This commit is contained in:
2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions

162
02_model_newcomer_survival.R Executable file
View File

@@ -0,0 +1,162 @@
#!/usr/bin/env Rscript
# Fits newcomer retention models
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(scales)
## Build the datasets only when `newcomers` is not already in scope.
if (!exists("newcomers")) {
  source("01_build_datasets.R")
}

## Switch between fitting on a sample (TRUE) and on the full `newcomers`
## table (FALSE). sample.newcomers() is presumably defined by
## lib-01-sample-datasets.R -- verify against that file.
## NOTE(review): if `newcomers` is a data.table, the FALSE branch binds the
## same object rather than a copy, so the `:=` calls later in this script
## would also modify `newcomers` -- confirm this is intended.
use.sample <- FALSE
newcomer.ds <- if (use.sample) {
  source("lib-01-sample-datasets.R")
  sample.newcomers()
} else {
  newcomers
}
library("optimx")
library("lme4")
## Derive the analytic covariates on newcomer.ds in a single `:=` call.
## All right-hand sides are computed before any column is assigned, so
## wiki.age.log is taken from wiki.age in weeks even though wiki.age itself
## is replaced by its value in years at the end of the same call.
newcomer.ds <- newcomer.ds[, `:=`(
  ## factor version of the wiki identifier (used as the grouping factor later)
  wiki = as.factor(wiki.name),
  wiki.age.log = log1p(as.double(wiki.age, units = 'weeks')),

  ## missing revert flags are treated as "not reverted"
  is.bot.reverted = ifelse(is.na(is.bot.reverted), FALSE, is.bot.reverted),
  is.admin.reverted = ifelse(is.na(is.admin.reverted), FALSE, is.admin.reverted),

  ## calendar factors derived from the time of the first edit
  year = as.factor(year(time.first.edit)),
  month = as.factor(paste0(year(time.first.edit), month(time.first.edit))),

  ## log(1 + x) transforms of the count-like variables
  ns0.edits.log = log1p(ns0.edits),
  ns1.edits.log = log1p(ns1.edits),
  ns4.edits.log = log1p(ns4.edits),
  n.other.wikis.log = log1p(n.other.wikis),
  n.edits.other.log = log1p(n.edits.other),
  n.messages.log = log1p(n.messages),
  n.editors.log = log1p(n.editors),
  total.wiki.length.log = log1p(total.wiki.length),
  n.ns4.edits.log = log1p(n.ns4.edits),
  n.ns4.editors.log = log1p(n.ns4.editors),
  ns4.editor.age.log = log1p(as.double(ns4.editor.age, units = 'years')),

  ## centred and scaled variables
  d.ns4.length.scaled = scale(d.ns4.length),
  ## NOTE(review): the source column is `newcomer.chars.change` while the new
  ## column says `changed` -- confirm the source column name is correct.
  newcomer.chars.changed.scaled = scale(newcomer.chars.change),

  session.edits.log = log1p(session.edits),

  ## overwrite wiki.age with its numeric value in years
  wiki.age = as.double(wiki.age, units = 'years')
)]
## Record summary statistics for the analytic variables and hand them to
## remember().

## Newcomers whose first session has 100 or more edits are counted as outliers.
outliers <- newcomer.ds[session.edits >= 100]

newcomer.summary.stats <- list(
  p.survives = mean(newcomer.ds$survives),
  var.survives = var(newcomer.ds$survives),
  N.outliers = nrow(outliers),
  ## mean / variance of first-session edits with the outliers excluded
  p.first.session.no.outliers = newcomer.ds[session.edits < 100, mean(session.edits)],
  var.first.session.no.outliers = newcomer.ds[session.edits < 100, var(session.edits)],
  p.reverted = mean(newcomer.ds$is.reverted),
  var.reverted = var(newcomer.ds$is.reverted),
  p.messaged = mean(newcomer.ds$is.messaged),
  var.messaged = var(newcomer.ds$is.messaged),
  ## first-session edits over all newcomers, outliers included
  mean.first.session.edits = mean(newcomer.ds$session.edits),
  var.first.session.edits = var(newcomer.ds$session.edits),
  med.first.session.edits = median(newcomer.ds$session.edits),
  p.bot.reverted = mean(newcomer.ds$is.bot.reverted),
  var.bot.reverted = var(newcomer.ds$is.bot.reverted)
)

remember(newcomer.summary.stats)
## Logistic regression specification shared by the three glm fits below.
halfak.formula <- as.formula(
  "survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki.age + quarter + wiki.name"
)

## Keep the full table under newcomer.ds.all, then restrict newcomer.ds to
## the rows with n.other.wikis == 0.
newcomer.ds.all <- newcomer.ds
newcomer.ds <- newcomer.ds[n.other.wikis == 0]

## 1. Fit on every newcomer.
print('fitting halfak model on all newcomers')
halfak.mod.all.newcomers <- glm(
  halfak.formula,
  data = newcomer.ds.all,
  family = binomial(link = logit)
)
saveRDS(halfak.mod.all.newcomers, "halfak.mod.all.newcomers.RDS")
remember(extract(halfak.mod.all.newcomers), "halfak.model.all.newcomers", silent = TRUE)

## 2. Fit on the restricted table.
print("fitting halfak model")
halfak.mod <- glm(
  halfak.formula,
  data = newcomer.ds,
  family = binomial(link = logit)
)
saveRDS(halfak.mod, "halfak.mod.RDS")
remember(extract(halfak.mod), "halfak.model", silent = TRUE)

## 3. Fit on the restricted table with per-wiki weights. Each row is weighted
## by weight.per.wiki divided by the number of rows of its wiki, so the
## weights of every wiki sum to the same total (weight.per.wiki) and the
## weights overall sum to nrow(newcomer.ds).
print('fitting halfak model with weights')
n.total.wikis <- newcomer.ds[, length(unique(wiki.name))]
weight.per.wiki <- nrow(newcomer.ds) / n.total.wikis
newcomer.ds <- newcomer.ds[, weights := weight.per.wiki / .N, by = wiki.name]
halfak.mod.weighted <- glm(
  halfak.formula,
  data = newcomer.ds,
  family = binomial(link = logit),
  weights = newcomer.ds$weights
)
saveRDS(halfak.mod.weighted, "halfak.mod.weighted.RDS")
remember(extract(halfak.mod.weighted), "halfak.model.weighted", silent = TRUE)
## print('fit halfak model on a sample')
## sample.size <- 30
## newcomer.ds <- newcomer.ds[,in.sample:=.N >= sample.size, by=wiki.name]
## newcomer.ds.sample <- newcomer.ds[,.SD[sample(.N,min(sample.size,.N))],by=wiki.name]
## halfak.mod.sample <- glm(halfak.formula,data=newcomer.ds.sample,family=binomial(link=logit))
## saveRDS(halfak.mod.sample,"halfak.mod.sample.RDS")
## remember(extract(halfak.mod.sample),"halfak.model.sample",silent=TRUE)
## Random-intercept-only logistic model of survival, with a random intercept
## per wiki and no fixed intercept, used to compute an intraclass correlation
## (ICC) for `survives`. optimx is already attached near the top of the
## script, so it is not loaded again here.
print('fitting RE model')
re.icc.survives.model <- glmer(
  as.formula("survives ~ + (1 | wiki) - 1"),
  data = newcomer.ds,
  family = binomial(link = logit)
)
saveRDS(re.icc.survives.model, "re.icc.survives.model.RDS")

## Variance components as a table; pick out the wiki-level intercept variance.
varcorrmat <- as.data.table(VarCorr(re.icc.survives.model))
wiki.var <- varcorrmat[grp == 'wiki' & var1 == "(Intercept)", vcov]

## The within-group term is the variance of the model residuals.
## NOTE(review): for a logit-link model a common alternative is the
## latent-scale constant pi^2 / 3 -- confirm this choice is intended.
group.var <- var(residuals(re.icc.survives.model))

## ICC = between-wiki variance / (within-group variance + between-wiki variance)
icc <- wiki.var / (group.var + wiki.var)

remember(varcorrmat, 'icc.survives.varcormat')
remember(group.var, 'icc.survives.group.var')
remember(icc, 'icc.survives')
## newcomer.no.pooling.f <- as.formula("survives ~ is.reverted:wiki.name + is.messaged:wiki.name + is.bot.reverted:wiki.name + session.edits.log:wiki.name + wiki.name + quarter:wiki.name + wiki.name:wiki.age - 1")
## newcomer.no.pooling.mod <- glm(newcomer.no.pooling.f,data=newcomer.ds,family=binomial(link=logit))
## remember(extract(newcomer.no.pooling.mod),"newcomer.no.pooling.mod",silent=TRUE)
## if( !(exists("halfak.robustness1.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
## halfak.robustness1.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 1 model")
## newcomer.robustness.ds <- newcomer.ds[p.reverted <= 0.05]
## halfak.robustness1.mod <- glm(halfak.robustness1.formula,data=newcomer.robustness.ds,family=binomial(link=logit))
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.model")
## }
## else if(file.exists("halfak.robustness1.mod.RDS") & !exists("halfak.robustness1.mod")){
## halfak.robustness1.mod <- readRDS("halfak.robustness1.mod.RDS")
## }
## else if (exists("halfak.robustness1.mod")){
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## }
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.mod")
## }
## if( !(exists("halfak.robustness2.mod") | file.exists("halfak.robustness2.mod.RDS")) | refit.models == TRUE){
## halfak.robustness2.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 2 model")
## newcomer.robustness.ds2 <- newcomer.ds[p.reverted <= 0.5]
## halfak.robustness2.mod <- glm(halfak.robustness2.formula,data=newcomer.robustness.ds2,family=binomial(link=logit))
## saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
## remember(extract(halfak.robustness2.mod),"halfak.robustness2.model")
## }
## else if(file.exists("halfak.robustness2.mod.RDS") & !exists("halfak.robustness2.mod")){
## halfak.robustness2.mod <- readRDS("halfak.robustness2.mod.RDS")
## }
## else if (exists("halfak.robustness2.mod")){
## saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
## }
## remember(extract(halfak.robustness2.mod),"halfak.robustness2.mod")
## }