Initial commit
p# new file: runwikiq.sh
This commit is contained in:
845
lib-01-build_newcomer_table.R
Normal file
845
lib-01-build_newcomer_table.R
Normal file
@@ -0,0 +1,845 @@
# Library containing code for processing wikiq tsvs into datasets
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

library(urltools)
library(lubridate)
### Is it more efficient to do the group-by inside the mclapply loop or outside?
## with group by outside mclapply
## user system elapsed
## 3.743 8.112 6.219

## user system elapsed
## 609.715 592.603 638.172

## with group by inside mclapply
## user system elapsed
## 3.670 8.302 5.780

## user system elapsed
## 739.826 408.396 596.346
## conclusion: do as much outside mclapply as possible

## Load the wikiq output of every wiki in `wiki.list` and derive the editor-,
## session- and newcomer-level columns that the later steps rely on.
##
## Arguments:
##   wiki.list                table with one row per wiki; each row index is
##                            handed to load.wikiq.files() (defined elsewhere).
##   session.window.length    a gap between two consecutive edits by the same
##                            editor longer than this starts a new session.
##   newcomer.period          edits made less than this long after the editor's
##                            first edit can count as newcomer edits.
##   newcomer.sunset          the wiki's last edit must come more than this long
##                            after the editor's first edit for is.newcomer.
##   n.early.period.sessions  sessions numbered below this (counting from 0)
##                            are flagged in.early.session.
##
## Returns the data.table of all edits with the derived columns added.
build.newcomer.table.step1 <- function(wiki.list,
                                       session.window.length = duration(1,units="hours"),
                                       newcomer.period = duration(2*30,units="days"),
                                       newcomer.sunset = duration(180,units="days"),
                                       n.early.period.sessions = 1){
  ## mc.preschedule=FALSE forks one job per wiki
  d.list <- mclapply(seq_len(nrow(wiki.list)), load.wikiq.files,
                     wiki.list=wiki.list, mc.preschedule=FALSE)
  # d.list <- lapply(1:nrow(wiki.list),wiki.list=wiki.list,load.wikiq.files)
  all.edits <- rbindlist(d.list)

  all.edits[, `:=`(time.first.edit = min(date.time),
                   time.last.edit = max(date.time)),
            by=.(editor.id, wiki.name)]

  ## strip literal double quotes from the text columns
  all.edits[, `:=`(editor = gsub("\"","",editor),
                   title = gsub("\"","",title),
                   reverteds = gsub("\"","",reverteds))]

  all.edits <- all.edits[editor != "Default"]
  all.edits[, month := floor_date(date.time,unit="month")]
  setkey(all.edits,wiki.name,editor.id,date.time)

  ## Gaps between consecutive edits of the same editor.  The key above sorts
  ## each editor's edits by time; padding with the editor's first / last edit
  ## time gives every edit a gap before and a gap after.
  ## (The closing parenthesis of the second diff() used to sit before
  ## `lag=1,differences=1`, which created two stray columns named `lag` and
  ## `differences`.)
  all.edits[, `:=`(time.since.last.edit = diff(c(first(time.first.edit),date.time),lag=1,differences=1),
                   time.till.next.edit = diff(c(date.time,last(time.last.edit)),lag=1,differences=1),
                   editor.tenure = as.duration(max(date.time)-min(date.time))),
            by=.(editor.id,wiki.name)]

  ## a session is a run of edits separated by at most session.window.length
  all.edits[, new.session := time.since.last.edit > session.window.length,
            by=.(editor.id,wiki.name)]
  all.edits[, nth.session := cumsum(new.session), by=.(editor.id,wiki.name)]
  all.edits[, in.early.session := nth.session < n.early.period.sessions]

  all.edits[, `:=`(is.reverted = any(reverted),
                   is.deleted = any(deleted),
                   p.reverted = mean(reverted & namespace ==0),
                   n.first.session = sum(in.early.session, na.rm=TRUE)),
            by=.(editor.id,wiki.name)]

  all.edits[, age := as.duration(date.time - time.first.edit)]
  all.edits[, last.wiki.edit := max(date.time), by=.(wiki.name)]

  ## a newcomer edit is a young, non-anonymous edit by an editor who joined
  ## more than newcomer.sunset before the wiki's last observed edit
  all.edits[, is.newcomer := (age < newcomer.period) &
                (as.duration(last.wiki.edit - time.first.edit) > as.duration(newcomer.sunset)) &
                !anon]

  ## did rejecting editors leave a comment on the talk page?
  return(all.edits)
}

## Flag every edit with whether its editor held a bot or an admin role when the
## edit was made, and exclude bots from the newcomer flag.
##
## Arguments:
##   all.edits  data.table of edits; read here: wiki.name, editor, date.time,
##              is.newcomer.
##   bots       data.table of role periods; read here: wiki, user,
##              role.period.begin, role.period.end, is.bot.
##   admins     same layout as `bots`, with is.admin instead of is.bot.
##
## All three tables are modified by reference (`bots` and `admins` gain
## wiki.name and editor columns).  Returns all.edits.
add.userroles <- function(all.edits,bots,admins){

  ## give the role tables the join column names used by all.edits
  bots[, `:=`(wiki.name = wiki, editor = user)]
  admins[, `:=`(wiki.name = wiki, editor = user)]

  ## an edit picks up the flag when it falls inside one of the role periods
  all.edits[bots,
            is.bot := i.is.bot,
            on=.(wiki.name,
                 editor,
                 date.time >= role.period.begin,
                 date.time <= role.period.end)]

  all.edits[admins,
            is.admin := i.is.admin,
            on=.(wiki.name,
                 editor,
                 date.time >= role.period.begin,
                 date.time <= role.period.end)]

  ## edits outside every role period were left NA by the joins
  all.edits[is.na(is.bot), is.bot := FALSE]
  all.edits[is.na(is.admin), is.admin := FALSE]

  all.edits[, is.newcomer := (is.newcomer & !is.bot)]
  return(all.edits)
}

## Link reverted edits to the edit that reverted them, record whether the
## reverted editor and/or the reverter followed up on the article talk page or
## on the editor's user talk page, and add wiki-level running totals plus a
## per-wiki week index.
##
## Arguments:
##   all.edits          data.table of edits (modified by reference).
##   discussion.window  how long after a revert (or after the editor's talk
##                      page post) a follow-up still counts.
##   week.length        width of the windows used for the `week` index.
##
## Returns all.edits with the new columns; the helper columns created along
## the way are dropped again before returning.
identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){

  ## titles of the pages on which a follow-up would appear
  all.edits[, user.talk := as.factor(paste0("User talk:",as.character(editor)))]
  all.edits[namespace==0, talk := as.factor(paste0("Talk:",as.character(title)))]

  print(" identifying reverts")
  ## `reverteds` is a comma-separated list of the revision ids an edit reverted
  all.edits[!is.na(reverteds), reverted.edits := lapply(strsplit(reverteds,","),strtoi)]
  all.edits[!is.na(reverteds), N.reverteds := lengths(reverted.edits)]

  ns.edits <- all.edits[namespace==0 | namespace==4]

  ## one row per (reverted revision, reverting edit)
  reverted.lookup <- ns.edits[!is.na(reverteds),
                              .(revid = unlist(reverted.edits),
                                wiki.name = rep(wiki.name,N.reverteds),
                                reverted.by = rep(editor,N.reverteds),
                                reverted.by.bot = rep(is.bot, N.reverteds),
                                reverted.by.admin = rep(is.admin, N.reverteds),
                                revert.date.time = rep(date.time,N.reverteds),
                                revert.id = rep(revid,N.reverteds))]

  ## Local tables get names that differ from the columns they carry, and every
  ## update join below reads the joined table through the i. prefix, so the
  ## result does not depend on whether all.edits already has such a column.
  reverted.rows <- ns.edits[reverted==TRUE]
  reverted.rows[reverted.lookup,
                `:=`(reverted.by = i.reverted.by,
                     reverted.by.bot = i.reverted.by.bot,
                     reverted.by.admin = i.reverted.by.admin,
                     revert.date.time = i.revert.date.time,
                     revert.id = i.revert.id),
                on=.(wiki.name,revid)]

  reverted.rows[, message.window.end := revert.date.time + discussion.window]

  ## merge back revert info to all.edits
  all.edits[reverted.rows,
            `:=`(reverted.by = i.reverted.by,
                 reverted.by.bot = i.reverted.by.bot,
                 reverted.by.admin = i.reverted.by.admin,
                 revert.date.time = i.revert.date.time,
                 revert.id = i.revert.id,
                 message.window.end = i.message.window.end),
            on = .(wiki.name, revid)]

  print(" done")
  print(" identifying editor talk page edits")

  ## we want talkers who talk before the end of the window
  talk.page.edits <- all.edits[namespace==1]
  talk.page.edits[, talk := title]

  ## we only need to keep the key identifier for each revert
  ## use editor + title instead of revid since editors may have more than
  ## one edit reverted by a given revert.id.
  ## key = wiki.name,editor,title,revert.id,
  setkeyv(reverted.rows,c("wiki.name","editor","title","revert.id"))

  ## condition where editor discusses after being reverted
  editor.talk.tbl <- reverted.rows[talk.page.edits,
                                   .(wiki.name,
                                     editor = x.editor,
                                     revert.id = x.revert.id,
                                     talk.id = i.revid,
                                     talk.date.time = i.date.time),
                                   on=.(editor,
                                        wiki.name,
                                        talk,
                                        revert.date.time<date.time,
                                        message.window.end>=date.time),
                                   nomatch=0L]

  ## keep the earliest talk page edit per revert
  editor.talk.tbl <- editor.talk.tbl[,
                                     .(editor.talks = TRUE,
                                       time.editor.talks = min(talk.date.time),
                                       editor.talks.revid = min(talk.id)),
                                     by = .(wiki.name,editor,revert.id)]

  ## merge back reverted edits to all.edits
  all.edits[editor.talk.tbl,
            `:=`(editor.talks = i.editor.talks,
                 time.editor.talks = i.time.editor.talks,
                 editor.talks.revid = i.editor.talks.revid),
            on=.(wiki.name,editor,revert.id)]

  ## tidy up
  rm(editor.talk.tbl, reverted.lookup)

  print(" done")
  print(" identifying reverter talk page edits")
  all.edits[, response.window.end := time.editor.talks + discussion.window]
  all.edits[(reverted==TRUE & is.na(editor.talks)), editor.talks := FALSE]

  reverted.rows <- all.edits[namespace==0 & reverted==TRUE]
  talk.page.edits <- all.edits[namespace==1]
  ## the talk page poster has to be the editor who made the revert
  talk.page.edits[, `:=`(talk = title, reverted.by = editor)]

  ## response.window.end is NA unless the reverted editor talked, so only
  ## those reverts can match here
  reverter.talk.tbl <- reverted.rows[talk.page.edits,
                                     .(wiki.name = wiki.name,
                                       editor = x.editor,
                                       revert.id = x.revert.id,
                                       revert.date.time = x.revert.date.time,
                                       time.reverter.talks = i.date.time,
                                       reverter.talk.id = i.revid),
                                     on=.(reverted.by,
                                          wiki.name,
                                          talk,
                                          revert.date.time<date.time,
                                          response.window.end>=date.time),
                                     nomatch=0L]

  reverter.talk.tbl <- reverter.talk.tbl[time.reverter.talks > revert.date.time,
                                         .(reverter.talks = TRUE,
                                           time.reverter.talks = min(time.reverter.talks),
                                           reverter.talk.id = min(reverter.talk.id)),
                                         by=.(wiki.name,editor,revert.id)]

  ## merge back reverted.edits to all.edits
  all.edits[reverter.talk.tbl,
            `:=`(reverter.talks = i.reverter.talks,
                 time.reverter.talks = i.time.reverter.talks,
                 reverter.talk.id = i.reverter.talk.id),
            on=.(wiki.name,editor,revert.id)]

  ## tidy up
  rm(reverter.talk.tbl,talk.page.edits)

  all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]

  # if the editor didn't talk first, the time window is different
  all.edits[reverter.talks == TRUE,
            editor.talks.first := (time.editor.talks < time.reverter.talks)]

  all.edits[(reverter.talks == TRUE) & (editor.talks.first==FALSE),
            reverter.talks := time.reverter.talks < (date.time + discussion.window)]

  print(" done")
  print(" identifying User talk page edits")

  ## now do the same thing but for user talk pages
  ## did the reverter post on the editor's user talk page?
  ## key is wiki.name, title, reverted.by, revert.id
  user.talk.edits <- all.edits[namespace==3]
  user.talk.edits[, `:=`(reverted.by = editor, user.talk = title)]
  reverted.rows <- all.edits[namespace==0 & reverted==TRUE]

  reverter.msg.tbl <- reverted.rows[user.talk.edits,
                                    .(wiki.name = x.wiki.name,
                                      title = x.title,
                                      revert.id = x.revert.id,
                                      editor = x.editor,
                                      reverted.by = i.reverted.by,
                                      time.reverter.messages = i.date.time,
                                      reverter.messages.id = i.revid),
                                    on=.(wiki.name,
                                         reverted.by,
                                         user.talk,
                                         revert.date.time <= date.time,
                                         message.window.end >= date.time),
                                    nomatch=0L]

  reverter.msg.tbl <- reverter.msg.tbl[,
                                       .(reverter.messages = TRUE,
                                         time.reverter.messages = min(time.reverter.messages),
                                         reverter.message.id = min(reverter.messages.id)),
                                       by=.(wiki.name, editor, reverted.by, revert.id)]

  reverted.rows[reverter.msg.tbl,
                `:=`(reverter.messages = i.reverter.messages,
                     time.reverter.messages = i.time.reverter.messages,
                     reverter.message.id = i.reverter.message.id),
                on=.(wiki.name, editor, revert.id)]

  reverted.rows[is.na(reverter.messages), reverter.messages := FALSE]

  all.edits[reverted.rows,
            `:=`(reverter.messages = i.reverter.messages,
                 time.reverter.messages = i.time.reverter.messages,
                 reverter.message.id = i.reverter.message.id),
            on=.(wiki.name, editor, revert.id)]

  ## set some wiki-level variables
  print(" creating wiki windows")
  ## rows stay in this order for the cumulative sums below; `:=` by group does
  ## not reorder rows, so sorting once is enough
  setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
  all.edits[, `:=`(chars.change = diff(c(0L,text.chars),lag=1,differences=1),
                   creates.article = (date.time == min(date.time))),
            by=.(wiki.name,articleid)]

  # Some wikis got created by Wikia - invalidating wiki age that doesn't remove this editor
  all.edits[, wiki.birth.date := min(date.time), by=.(wiki.name)]

  all.edits[, `:=`(total.wiki.length = cumsum(chars.change),
                   n.articles = cumsum(creates.article),
                   wiki.age = as.duration(date.time - wiki.birth.date),
                   year = year(date.time)),
            by=.(wiki.name)]

  all.edits[, `:=`(wiki.age.months = floor(as.double(wiki.age,units='days')/30),
                   wiki.age.years = floor(as.double(wiki.age,units='years')))]

  ## generate breaks at precisely 1 week +/- the first edit.
  date.range <- all.edits[, .(first.edit = min(date.time), last.edit = max(date.time)),
                          by = .(wiki.name)]

  window.breaks <- date.range[, .(breaks = seq(trunc(first.edit,"days"),
                                               trunc(last.edit,"days"),
                                               by=week.length),
                                  break.next = seq(trunc(first.edit+week.length,"days"),
                                                   trunc(last.edit+week.length,"days"),
                                                   by=week.length)),
                              by=.(wiki.name)]

  ## number the windows within each wiki
  window.breaks[, i.break := seq_along(breaks), by=(wiki.name)]

  all.edits[window.breaks,
            `:=`(week = i.break),
            on=.(wiki.name, date.time <=break.next,date.time >=breaks)]

  print(" done")
  ## tidy up: drop the helper columns (only those that actually exist)
  drop.cols <- intersect(c("reverted.edits",
                           "N.reverteds",
                           "user",
                           "user.talk",
                           "talk",
                           "message.window.end",
                           "response.window.end"),
                         names(all.edits))
  all.edits[, (drop.cols) := NULL]

  print(" done")
  rm(reverted.rows,reverter.msg.tbl,user.talk.edits)
  return(all.edits)
}

## Collapse the early-session main-namespace edits of every editor into one
## row per (wiki, editor) and attach survival, prior activity on other wikis
## and user talk page messages.
##
## Arguments:
##   all.edits        data.table of edits as returned by
##                    identify.revert.messages(); it is re-keyed by date.time
##                    and gains columns by reference.
##   newcomer.period  an editor "survives" if some edit is older than this ...
##   newcomer.sunset  ... and younger than this (both measured from the
##                    editor's first edit).
##
## Returns the newcomers data.table.
build.newcomers <- function(all.edits,
                            newcomer.period = duration(60,units="days"),
                            newcomer.sunset= duration(30*6,units="days")
                            ){
  setkeyv(all.edits,'date.time')

  all.edits[, time.last.edit.to.wiki := max(date.time), by=.(wiki.name)]

  ## Time until the next edit to the same page.  diff() on POSIXct chooses the
  ## difftime unit on every call, so grouped results (and the back-fill below)
  ## could carry different units within one column; work in plain seconds
  ## instead and convert to days once at the end.
  secs.per.day <- 86400
  all.edits[, time.till.page.edit := c(diff(as.numeric(date.time)), NA_real_),
            by=.(wiki.name,articleid)]
  all.edits[, last.edit.to.page := is.na(time.till.page.edit)]

  ## a page's last edit is measured against the wiki's last edit instead
  all.edits[last.edit.to.page == TRUE,
            time.till.page.edit := as.numeric(time.last.edit.to.wiki) - as.numeric(date.time)]

  all.edits[, time.till.page.edit := log1p(time.till.page.edit / secs.per.day)]

  editor.variables <- all.edits[,
                                .(survives = any( (age > newcomer.period) & (age < newcomer.sunset)),
                                  anon = first(anon),
                                  is.bot = any(is.bot),
                                  is.admin = any(is.admin)),
                                by = .(wiki.name,editor)]

  first.session.edits <- all.edits[in.early.session==TRUE]
  first.session.edits[, end.newcomer.period := time.first.edit + newcomer.period]

  print(" aggregating newcomer activity within wikis")
  ## NOTE(review): the namespace == 0 filter means ns1.edits and ns4.edits
  ## below are always 0 and ns0.edits equals newcomer.edits -- confirm intent.
  newcomers <- first.session.edits[namespace == 0,
                                   .(
                                     is.reverted = any(reverted & reverted.by != editor),
                                     p.reverted = first(p.reverted),
                                     is.bot.reverted = any(reverted.by.bot),
                                     is.admin.reverted = any(reverted.by.admin),
                                     is.reverted.messaged = any(reverter.messages |
                                                                reverter.talks,na.rm=TRUE),
                                     reverter.talks = any(reverter.talks, na.rm=TRUE),
                                     reverter.messages = any(reverter.messages, na.rm=TRUE),
                                     editor.talks = any(editor.talks,na.rm=TRUE),
                                     time.next.page.edit = min(time.till.next.edit, na.rm=TRUE),
                                     BRD.initiation = any(editor.talks &
                                                          (editor.talks.first |
                                                           !reverter.talks), na.rm = TRUE),
                                     BRD.reciprocation = any(editor.talks &
                                                             editor.talks.first &
                                                             reverter.talks, na.rm = TRUE),
                                     reverter.initates.BRD = any(reverter.talks & (!editor.talks.first |
                                                                                   is.na(editor.talks.first)),na.rm=TRUE),
                                     time.first.edit = first(time.first.edit),
                                     time.till.page.edit = min(time.till.page.edit),
                                     last.edit.to.page = all(last.edit.to.page),
                                     end.newcomer.period = first(end.newcomer.period),
                                     week = first(week),
                                     year = first(year(time.first.edit)),
                                     newcomer.edits = .N,
                                     session.edits = first(n.first.session),
                                     ns0.edits = sum(namespace == 0),
                                     ns1.edits = sum(namespace == 1),
                                     ns4.edits = sum(namespace == 4),
                                     newcomer.chars.change = sum(chars.change),
                                     newcomer.creates.article = any(creates.article),
                                     wiki.type = first(wiki.type),
                                     wiki.age = first(wiki.age)
                                   ),
                                   by = .(wiki.name, editor)]

  newcomers[editor.variables,
            `:=`(survives = i.survives, is.bot = i.is.bot, is.admin = i.is.admin),
            on=.(wiki.name,editor)]

  newcomers <- newcomers[!is.bot & !is.admin]
  print(" done")
  print(" identifying newcomer activity on other wikis")

  ## early-session edits by the same editor name on wikis of the same
  ## wiki.type where the editor started earlier than on this wiki
  newcomer.prior.wikis <- first.session.edits[newcomers,
                                              .(
                                                editor = editor,
                                                wiki.name = i.wiki.name,
                                                other.wiki = x.wiki.name,
                                                time.first.edit.this = i.time.first.edit,
                                                time.first.edit.other = x.time.first.edit
                                              ),
                                              on=.(wiki.type,editor,time.first.edit < time.first.edit),
                                              nomatch=0L,
                                              allow.cartesian = TRUE]

  # using < time first edit should exclude edits to this wiki
  newcomer.prior.wikis <- newcomer.prior.wikis[, .(n.edits.other = .N),
                                               by=.(editor,wiki.name,other.wiki)]

  newcomer.prior.wikis <- newcomer.prior.wikis[,
                                               .(n.other.wikis = .N,
                                                 n.edits.other = sum(n.edits.other)),
                                               by=.(wiki.name,editor)]

  ## one row per newcomer, NA where no prior wiki was found
  newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
                                               .(
                                                 wiki.name = wiki.name,
                                                 editor = editor,
                                                 n.other.wikis = n.other.wikis,
                                                 n.edits.other = n.edits.other,
                                                 has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
                                               on=.(wiki.name,editor),
                                               nomatch=NA]

  newcomers <- newcomers[newcomer.prior.wikis,
                         `:=`(n.other.wikis = ifelse(is.na(i.n.other.wikis),0,i.n.other.wikis),
                              n.edits.other = ifelse(is.na(i.n.edits.other),0,i.n.edits.other),
                              has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
                         on=.(wiki.name, editor)]

  newcomers[, `:=`(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),FALSE,has.edited.other.wikis),
                   n.edits.other = ifelse(is.na(n.edits.other),0,n.edits.other),
                   n.other.wikis = ifelse(is.na(n.other.wikis),0,n.other.wikis))]

  print(" done")
  print(" identifying all messages")

  user.talk.edits <- all.edits[namespace==3]
  user.talk.edits[, user.talk := title]

  newcomers[, user.talk := as.factor(paste0("User talk:",as.character(editor)))]

  ## count edits to the newcomer's user talk page up to the end of the
  ## newcomer period
  newcomer.messages <- user.talk.edits[newcomers,
                                       .(
                                         editor = i.editor,
                                         n.messages = .N,
                                         end.newcomer.period = i.end.newcomer.period
                                       ),
                                       on=.(wiki.name,user.talk,date.time <= end.newcomer.period),
                                       by=.EACHI,
                                       nomatch=0L]

  newcomer.messages <- newcomer.messages[newcomers,
                                         .(wiki.name,
                                           editor,
                                           n.messages = x.n.messages,
                                           is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
                                         on=.(wiki.name,editor),
                                         nomatch = NA]

  newcomers <- newcomers[newcomer.messages,
                         `:=`(n.messages = ifelse(is.na(i.n.messages),0L,i.n.messages),
                              is.messaged = ifelse(is.na(i.n.messages),FALSE,i.is.messaged)),
                         on=.(wiki.name,editor)]

  ## keep only newcomers on "wikia" wikis whose first edit is at least 60 days
  ## before the end of the data
  last.edit <- max(all.edits$date.time)
  last.wikia.edit <- max(all.edits[wiki.type=="wikia",date.time])
  newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60,units="days")]
  newcomers <- newcomers[(wiki.type == "wikia") & (time.first.edit < (last.wikia.edit - as.difftime(60,units="days")))]

  print(" done")
  return(newcomers)
}

## Return the edits made to namespace 4 by non-anonymous editors.
##
## Arguments:
##   all.edits    data.table of edits with `namespace` and `anon` columns.
##   week.length  accepted for interface compatibility; not used here.
build.namespace4.dataset <- function(all.edits, week.length = as.difftime(7,units="days")){
  keep <- quote((namespace==4) & (anon==FALSE))
  return(all.edits[eval(keep)])
}

## Aggregate all.edits into one row per (wiki.name, week) with editor counts,
## wiki size, revert-related rates from namespace 0 and activity in namespace 4.
##
## Arguments:
##   all.edits    data.table of edits as returned by identify.revert.messages().
##   week.length  accepted for interface compatibility; not used here.
##
## Returns the wiki-week data.table.
build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){

  ## sum(numerator) / sum(denominator), ignoring NAs in both
  ratio <- function(numerator, denominator){
    sum(numerator,na.rm=TRUE)/sum(denominator,na.rm=TRUE)
  }

  wiki.data <- all.edits[, .(n.editors = length(unique(editor)),
                             total.wiki.length = last(total.wiki.length)),
                         by=.(wiki.name,week)]

  wiki.ns4.data <- all.edits[namespace==4,
                             .(n.ns4.edits = .N,
                               n.ns4.editors = length(unique(editor)),
                               d.ns4.length = sum(chars.change),
                               ns4.editor.age = mean(age)),
                             by=.(wiki.name, week)]

  wiki.ns0.data <- all.edits[namespace==0,
                             .(revert.rate = mean(reverted,na.rm=TRUE),
                               newcomer.revert.rate = ratio(reverted & is.newcomer, is.newcomer),
                               revert.disc.rate = ratio(reverted & reverter.talks, reverted),
                               newcomer.revert.disc.rate = ratio(reverted & reverter.talks & is.newcomer,
                                                                 reverted & is.newcomer),
                               revert.message.rate = ratio(reverted & reverter.messages, reverted),
                               newcomer.revert.message.rate = ratio(reverted & reverter.messages & is.newcomer,
                                                                    reverted & is.newcomer),
                               newcomer.edits.rate = mean(is.newcomer,na.rm=TRUE),
                               bot.revert.rate = mean(reverted.by.bot,na.rm=TRUE),
                               bot.revert.prop = ratio(reverted.by.bot, reverted),
                               newcomer.bot.revert.rate = mean((reverted.by.bot & is.newcomer),na.rm=TRUE),
                               newcomer.bot.revert.prop = ratio(reverted.by.bot & is.newcomer,
                                                                reverted & is.newcomer),
                               admin.revert.rate = mean(reverted.by.admin,na.rm=TRUE),
                               admin.revert.prop = ratio(reverted.by.admin, reverted),
                               year = year(first(date.time)),
                               month = month(first(date.time))),
                             by=.(wiki.name,week)]

  ## replace NAs with 0 (revert.rate and newcomer.revert.rate are left as is)
  zero.fill.cols <- c("revert.disc.rate",
                      "newcomer.revert.disc.rate",
                      "revert.message.rate",
                      "newcomer.revert.message.rate",
                      "newcomer.edits.rate",
                      "bot.revert.rate",
                      "bot.revert.prop",
                      "newcomer.bot.revert.rate",
                      "newcomer.bot.revert.prop",
                      "admin.revert.rate",
                      "admin.revert.prop")
  for(rate.col in zero.fill.cols){
    wiki.ns0.data[is.na(get(rate.col)), (rate.col) := 0]
  }

  ## bring it together
  wiki.data[wiki.ns0.data,
            `:=`(revert.rate = i.revert.rate,
                 revert.disc.rate = i.revert.disc.rate,
                 newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
                 revert.message.rate = i.revert.message.rate,
                 newcomer.revert.message.rate = i.newcomer.revert.message.rate,
                 newcomer.edits.rate = i.newcomer.edits.rate,
                 bot.revert.rate = i.bot.revert.rate,
                 bot.revert.prop = i.bot.revert.prop,
                 newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
                 newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
                 admin.revert.rate = i.admin.revert.rate,
                 admin.revert.prop = i.admin.revert.prop),
            on=.(wiki.name,week)]

  wiki.data[wiki.ns4.data,
            `:=`(n.ns4.edits = i.n.ns4.edits,
                 n.ns4.editors = i.n.ns4.editors,
                 d.ns4.length = i.d.ns4.length,
                 ns4.editor.age = i.ns4.editor.age),
            on=.(wiki.name,week)]

  # create variables for community size in standard deviation units
  return(wiki.data)
}

## Make sure a global `all.edits` exists.  Does nothing when one is already
## visible; otherwise reads all.edits.RDS if present, or rebuilds the table
## from the wikiq files (optionally saving it), and assigns the result to the
## global `all.edits`.
##
## Reads the globals wiki.list, newcomer.period, bots, admins and nosave, and
## calls remember(), which is defined elsewhere.
load.all.edits <- function(){
  if(exists("all.edits")){
    return(invisible(NULL))
  }

  file.name <- "all.edits.RDS"
  if(file.exists(file.name)){
    print("loading wikiq data with reverts and messages")
    all.edits <- readRDS(file.name)
    print("done")
  } else{
    print("loading wikiq data")

    all.edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)

    print("done")

    print("adding user role data")
    all.edits <- add.userroles(all.edits,bots=bots,admins=admins)
    print("done")

    print("identifying reverts and messages")
    all.edits <- identify.revert.messages(all.edits,week.length=as.difftime(7,units="days"))
    print("done")
    if(!nosave){
      print("saving work")
      saveRDS(all.edits,file.name)
      print("done")
    }
  }

  remember(min(all.edits$date.time),"earliest.data.point")
  remember(max(all.edits$date.time),"latest.data.point")

  ## make all.edits a global variable
  all.edits <<- all.edits
}

## analysis parameters shared by the build steps below
newcomer.period <- duration(2*30,units="days")
newcomer.sunset <- duration(30*6,units="days")
week.length <- duration(7,units="days")

## record the parameters via remember() (defined elsewhere)
remember(newcomer.period)
remember(newcomer.sunset)
remember(week.length)

## try loading newcomers: reuse an existing object, else the cached RDS file,
## else build the table from all.edits
if(!exists("newcomers")){
  file.name2 <- "newcomers.RDS"
  if(!file.exists(file.name2)){
    print("building newcomers table")
    load.all.edits()

    newcomers <- build.newcomers(all.edits,
                                 newcomer.sunset = newcomer.sunset,
                                 newcomer.period=newcomer.period)

    print("done")
    print("saving work")
    if(!nosave){
      saveRDS(newcomers,file.name2)
    }
  } else{
    newcomers <- readRDS(file.name2)
  }
}

## namespace 4 edits by registered editors: reuse, load from cache, or build
if(!exists("ns4.reg.edits")){
  file.name <- "ns4.reg.edits.RDS"
  if(!file.exists(file.name)){
    print("building ns4 edits table")

    ## create table of namespace 4 edits from all edits
    load.all.edits()
    ns4.reg.edits <- build.namespace4.dataset(all.edits)
    print("done")
    print("saving work")
    if(!nosave){
      saveRDS(ns4.reg.edits,file.name)
    }
  } else{
    ns4.reg.edits <- readRDS(file.name)
  }
}

## wiki-week level variables: reuse, load from cache, or build
if(!exists("wiki.data")){
  file.name3 <- "wikiweeks.RDS"
  if(file.exists(file.name3)){
    wiki.data <- readRDS(file.name3)
  } else{
    print("building wiki level variable")
    load.all.edits()
    wiki.data <- build.wiki.level.variables(all.edits, week.length=week.length)
    print("done")
    print("saving work")
    if(!nosave){
      saveRDS(wiki.data,file.name3)
    }
    print("done")
  }
}

#wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
#remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
#newcomers <- newcomers[!(wiki.name %in% wikis.to.remove)]
#all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
## per-wiki summary statistics: reuse, load from cache, or build
## NOTE(review): unlike the caches above, this one is written even when
## `nosave` is set, and remember() only runs when the table is rebuilt --
## confirm that both are intended.
if(!exists("wiki.stats")){
  file.name <- "wiki.stats.RDS"
  if(file.exists(file.name)){
    wiki.stats <- readRDS(file.name)
  } else {
    load.all.edits()

    editor.tenures <- all.edits[, .(tenure=first(editor.tenure)), by=.(wiki.name,editor)]
    wiki.stats <- all.edits[, .(total.editors = length(unique(editor)),
                                total.edits = .N,
                                total.reverts = sum(reverted),
                                total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
                                total.ns4.edits = sum(namespace==4, na.rm=TRUE),
                                med.edit.tenure = median(editor.tenure)),
                            by=.(wiki.name)]

    ## median over editors (one tenure each) rather than over edits
    med.editor.tenure <- editor.tenures[, .(med.editor.tenure=median(tenure)), by=.(wiki.name)]

    wiki.stats[med.editor.tenure, med.tenure := i.med.editor.tenure, on="wiki.name"]
    newcomer.stats <- newcomers[, .(retention.rate = mean(survives),
                                    reverted.newcomers = sum(is.reverted)),
                                by=.(wiki.name)]
    wiki.stats <- wiki.stats[newcomer.stats,
                             `:=`(retention.rate = i.retention.rate,
                                  reverted.newcomers = i.reverted.newcomers),
                             on="wiki.name"]
    remember(wiki.stats,silent=TRUE)
    saveRDS(wiki.stats,file.name)
  }
}

## long-format copy of the count statistics, with a (row, col) position for
## each variable
row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
## NOTE(review): none of the row2 names is among the melted measure.vars, so
## the row2 assignment below matches no rows.
row2 <- c("med.editor.tenure","retention.rate")
m.wiki.stats <- melt(wiki.stats, id.vars='wiki.name', measure.vars = row1)
m.wiki.stats[variable %in% row1,
             `:=`(row = 1, col = which(row1 == variable, useNames=FALSE)),
             by=variable]
m.wiki.stats[variable %in% row2,
             `:=`(row = 2, col = which(row2 == variable, useNames=FALSE)),
             by=variable]

## drop zero bot-revert counts; raise every other zero count to 1
m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]

## Map wiki.stats variable names to display labels.
##
## Arguments:
##   varname  character vector or factor of variable names.
##
## Returns a character vector named by the input values; names that have no
## label map to NA.  (The earlier sapply()/switch() version returned a list
## for empty input or unknown names.)
friendly.var <- function(varname){
  labels <- c(total.editors = 'Editors',
              total.reverts = 'Reverts',
              total.bot.reverts = 'Bot reverts',
              total.ns4.edits = 'Edits to the project namespace')
  keys <- as.character(varname)
  out <- unname(labels[keys])
  names(out) <- keys
  out
}

## Map wiki.stats variable names to their numeric position (1-4).
##
## Arguments:
##   varname  character vector or factor of variable names.
##
## Returns a numeric vector named by the input values; names that have no id
## map to NA.  (The earlier sapply()/switch() version returned a list for
## empty input or unknown names.)
var.id <- function(varname){
  ids <- c(total.editors = 1,
           total.reverts = 2,
           total.bot.reverts = 3,
           total.ns4.edits = 4)
  keys <- as.character(varname)
  out <- unname(ids[keys])
  names(out) <- keys
  out
}

med.line.width <- 1

## replace the variable names by display labels, in a fixed level order
stat.levels <- c('Editors',"Reverts","Bot reverts","Edits to the project namespace")
m.wiki.stats[, variable := friendly.var(variable)]
m.wiki.stats <- m.wiki.stats[, variable := factor(variable, levels=stat.levels)]

## median value of each statistic across wikis
spoke.data <- m.wiki.stats[, .(y = median(value)), by=variable]
remember(m.wiki.stats)
remember(spoke.data)
remember(nrow(wiki.stats),"n.wikia.wikis")

## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
## Each newcomer row takes the statistics of its own (wiki.name, week); the
## wiki.age.* and quarter columns are derived from the newcomer row itself.
newcomers <- newcomers[wiki.data,
                       `:=`(
                         wiki.name = i.wiki.name,
                         week = i.week,
                         n.editors = i.n.editors,
                         total.wiki.length = i.total.wiki.length,
                         revert.rate = i.revert.rate,
                         revert.disc.rate = i.revert.disc.rate,
                         newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
                         revert.message.rate = i.revert.message.rate,
                         newcomer.revert.message.rate = i.newcomer.revert.message.rate,
                         newcomer.edits.rate = i.newcomer.edits.rate,
                         bot.revert.rate = i.bot.revert.rate,
                         bot.revert.prop = i.bot.revert.prop,
                         newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
                         newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
                         admin.revert.rate = i.admin.revert.rate,
                         admin.revert.prop = i.admin.revert.prop,
                         n.ns4.edits = i.n.ns4.edits,
                         n.ns4.editors = i.n.ns4.editors,
                         d.ns4.length = i.d.ns4.length,
                         ns4.editor.age = i.ns4.editor.age,
                         wiki.age.weeks = as.double(wiki.age,units='days')/7,
                         wiki.age.months = floor(as.double(wiki.age,units='days')/30),
                         wiki.age.half.years = floor(as.double(wiki.age,units='years')*2),
                         wiki.age.years = floor(as.double(wiki.age,units='years')),
                         quarter = factor(floor_date(time.first.edit,unit="3 months"))
                       ),
                       on=.(wiki.name,week)]

## Share of surviving newcomers and number of newcomers per wiki-week.
## The grouping columns come from `by` alone: listing wiki.name and week in j
## as well duplicated both columns and produced one row per newcomer instead
## of one row per (wiki.name, week).
survival.data <- newcomers[, .(survival.rate = mean(survives),
                               n.newcomers = .N),
                           by = .(wiki.name, week)]

## attach the survival statistics to the wiki-week table
wiki.data <- wiki.data[survival.data,
                       `:=`(survival.rate = i.survival.rate,
                            n.newcomers = i.n.newcomers),
                       on = .(wiki.name,week)]

## number of edits per editor, wiki and month of wiki age; loaded from the
## cache file when it exists, otherwise built from all.edits and saved
file.name <- "active.editors.RDS"
if(file.exists(file.name)){
  active.editors <- readRDS(file.name)
} else {
  load.all.edits()
  active.editors <- all.edits[, .(N.edits = .N,
                                  wiki.age.years = first(wiki.age.years)),
                              by=.(wiki.name,
                                   editor,
                                   wiki.age.months)]
  saveRDS(active.editors, file.name)
}
Reference in New Issue
Block a user