32 lines
1.6 KiB
R
32 lines
1.6 KiB
R
library(MASS)
|
|
library(brms)
|
|
## Set the global `mc.cores` option to 28 for the rest of the session.
## NOTE(review): presumably consumed by the brms model fits; confirm the
## host actually has this many cores.
options(mc.cores = 28)
|
|
|
|
library(data.table)
|
|
library(arrow)
|
|
|
|
## Sampling parameters saved to disk; `label_sample_counts` is read from this
## object further down when the weights are built.
sample.params <- readRDS(file = "remember_sample_quality_labels.RDS")

## Scored article sample, read from feather and converted to a data.table so
## the `:=` / join syntax used below is available.
df <- data.table(
  read_feather("data/scored_article_sample.feather")
)
|
|
## Lookup from the value stored in `wp10` to a quality-class label: the
## position in this list is the index used below.
## NOTE(review): 'start' precedes 'stub' here, unlike the ordinal levels
## below; presumably this mirrors how the feather file encodes `wp10`. Confirm.
wp10dict <- list("start", "stub", "c", "b", "a", "ga", "fa")

## Translate each row's `wp10` index into its label and store it as an ordered
## factor ranked from the lowest class ('stub') to the highest ('fa').
df[, wp10 := factor(
  as.character(wp10dict[wp10]),
  levels = c("stub", "start", "c", "b", "a", "ga", "fa"),
  ordered = TRUE
)]

## remove 'a' class articles for a fair comparison.
df <- df[wp10 != "a"]
|
|
## Parse the "%Y%m%d%H%M%S" timestamps into POSIXct.
## NOTE(review): tz = "UTC" assumes the timestamps are recorded in UTC; confirm.
## A fixed zone also keeps the result independent of the machine's locale and
## avoids NA values for wall-clock times that fall into a local DST gap.
df[, datetime := as.POSIXct(timestamp, format = "%Y%m%d%H%M%S", tz = "UTC")]

## Rescale time to [0, 1]: seconds since the earliest observation divided by
## the total observed span.
## The numeric value must come from the parsed `datetime`, not from the raw
## `timestamp`: a "%Y%m%d%H%M%S" value read as a number is not linear in
## elapsed time (it jumps at every minute/hour/day/month/year rollover).
df[, datetime.numeric := as.numeric(datetime)]
## na.rm = TRUE so that a single unparseable timestamp leaves only its own row
## NA instead of turning the whole column NA.
df[, datetime.numeric := datetime.numeric - min(datetime.numeric, na.rm = TRUE)]
df[, datetime.numeric := datetime.numeric / max(datetime.numeric, na.rm = TRUE)]
|
|
|
|
## Per-class label counts taken from the saved sampling parameters, with
## `wp10` turned into an ordered factor over all seven classes.
data.counts <- data.table(sample.params$label_sample_counts)
data.counts[, wp10 := factor(
  wp10,
  levels = c("stub", "start", "c", "b", "a", "ga", "fa"),
  ordered = TRUE
)]

## Number of rows of `df` in each class (column `N`), sorted by class.
## The factor is rebuilt without the 'a' level, which was filtered out of `df`.
sample.counts <- df[, .N, by = wp10][order(wp10)]
sample.counts[, wp10 := factor(
  wp10,
  levels = c("stub", "start", "c", "b", "ga", "fa"),
  ordered = TRUE
)]

## One row per class present in the sample, carrying both sets of counts.
weights <- data.counts[sample.counts, on = "wp10"]

## Each weight is the class's share of the label counts (articles or
## revisions) divided by the class's share of the sample.
weights[, `:=`(
  article_weight = (n_articles / sum(n_articles)) / (N / sum(N)),
  revision_weight = (n_revisions / sum(n_revisions)) / (N / sum(N))
)]

## Attach the per-class counts and weights to every row of the sample.
df <- df[weights, on = "wp10"]
|
|
## Evenly spaced six-level quality score: the per-class columns weighted by
## the rank of their class, 1 (Stub) through 6 (FA).
## The ranks follow the ordinal order used for the `wp10` factor in this file
## (stub < start < c < b < ga < fa), so C is rank 3 and B is rank 4.
## NOTE(review): the per-class columns are presumably predicted class
## probabilities; confirm against the scoring step.
## Written as plain column arithmetic so it is vectorised instead of calling
## apply() once per row.
df[, quality.even6 := 1 * Stub + 2 * Start + 3 * C + 4 * B + 5 * GA + 6 * FA]
|