Initial commit
p# new file: runwikiq.sh
This commit is contained in:
44
00_select_wikis.R
Executable file
44
00_select_wikis.R
Executable file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env Rscript
|
||||
|
||||
## Script used to choose the top 1% of wikis to analyze
|
||||
|
||||
# Copyright (C) 2018 Nathan TeBlunthuis
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
library("ggplot2")
|
||||
library("data.table")
|
||||
# Directory holding the editor-count files. The path is relative, so it is
# resolved against the working directory the script is run from.
counts.dir <- "../wikiq_wikia_2010_unique_editors/"

# Base names (not full paths) of the non-hidden entries in that directory.
files <- list.files(path = counts.dir)
|
||||
# Read one headerless CSV count file and return it as a data.frame.
#
# Args:
#   f:   File name, relative to `dir`.
#   dir: Directory that contains `f`. Defaults to the script-level
#        `counts.dir`, so existing one-argument calls behave as before.
#
# Returns:
#   The data.frame produced by read.csv(..., header = FALSE); columns get
#   read.csv's default names (V1, V2, ...).
read.count.file <- function(f, dir = counts.dir) {
  # file.path() inserts the separator itself, so `dir` is accepted with or
  # without a trailing slash (paste0() silently required one).
  read.csv(file.path(dir, f), header = FALSE)
}
|
||||
# Wiki database names: the count-file names with ".editors" stripped out.
dbname <- gsub("\\.editors", "", files)

# Read every count file. lapply() always yields a list, whereas sapply()
# changes its return shape depending on what the files contain; the list is
# then flattened into a plain vector of counts.
counts <- lapply(files, read.count.file)
counts <- unlist(counts, use.names = FALSE)

# The table below pairs counts with names by position, so each file must
# contribute exactly one value. Fail loudly here rather than build a
# misaligned (or recycled) table.
stopifnot(length(counts) == length(dbname))

# One row per wiki: its database name and its editor count.
dt <- data.table(wiki = dbname, n.editors = counts)
|
||||
|
||||
|
||||
#ggplot(dt,aes(x=n.editors)) + stat_ecdf(geom="step") + scale_x_log10(minor_breaks=10**(1:10/2)) + scale_y_continuous(minor_breaks=1:20/20)
|
||||
|
||||
# Cutoff: the 99th percentile of per-wiki editor counts, so wikis at or above
# it are (approximately) the top 1%.
# NOTE(review): an earlier comment here described taking all wikis with
# > 100 editors as a round-number stand-in for the top 1%; the code actually
# thresholds on the computed percentile. Confirm which one is intended.
top_1_percentile <- quantile(x = dt$n.editors, probs = 0.99)

# Subsetting with `i` returns a new table, so the := updates below do not
# modify `dt`.
wiki.list <- dt[n.editors >= top_1_percentile]

# Fill in the default Wikia URL wherever none is set. The table built from
# `wiki` and `n.editors` has no `url` column; without creating it first,
# `is.na(url)` would refer to the base R function `url`, select no rows, and
# leave every URL unset.
if (!"url" %in% names(wiki.list)) {
  wiki.list[, url := NA_character_]
}
wiki.list[is.na(url), url := paste0("http://", wiki, ".wikia.com/")]

# Tag every selected wiki with its type.
wiki.list[, wiki.type := "wikia"]

# Write the selection to the working directory.
fwrite(wiki.list, "selected.wikis.csv")
|
||||
|
||||
Reference in New Issue
Block a user