Initial commit
p# new file: runwikiq.sh
This commit is contained in:
59
RCommunityData/R/namespaces.R
Normal file
59
RCommunityData/R/namespaces.R
Normal file
@@ -0,0 +1,59 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
## functions to deal with namespace information
|
||||
#####################################################################
|
||||
# Load the table of Wikia namespaces from a tab-separated file.
#
# Arguments:
#   path: location of a headerless TSV file with three columns
#         (wiki, namespace number, namespace string). Defaults to the
#         location this function has always read from.
#
# Returns a data frame with columns "wiki", "ns.num" and "ns.string";
# "ns.num" is converted to a factor, and string columns are read as factors.
load.wikia.namespaces <- function (path = "~/data/wikia_namespaces.tsv") {
  wikia.namespaces <- read.delim(path, stringsAsFactors = TRUE,
                                 header = FALSE)

  colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")

  # namespace numbers are identifiers rather than quantities
  wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)

  return(wikia.namespaces)
}
|
||||
|
||||
# enwiki - move to barnstars directory
|
||||
# TODO: TEST
|
||||
# Build the lookup table of English Wikipedia namespaces.
#
# Returns a named numeric vector: the values are namespace numbers and the
# names are the corresponding namespace prefixes (namespace 0 has the empty
# name "").
load.enwiki.namespaces <- function(){
  enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 100, 101, 108, 109)

  names(enwiki.ns.num) <- c( "Media", "Special", "", "Talk", "User", "User talk",
                            "Wikipedia", "Wikipedia talk","File", "File talk",
                            "MediaWiki", "MediaWiki talk", "Template", "Template talk",
                            "Help", "Help talk", "Category", "Category talk",
                            "Portal", "Portal talk", "Book","Book talk")

  # A replacement call such as `names(x) <- value` evaluates (invisibly) to
  # `value`, so without an explicit return the caller would receive only the
  # vector of names rather than the named table.
  return(enwiki.ns.num)
}
|
||||
|
||||
# function to take a list of article titles and a wiki name and return
|
||||
# a list of numbered namespaces
|
||||
# Map page titles to namespace numbers for one wiki.
#
# Arguments:
#   page.titles: vector of page titles; underscores are treated as spaces.
#   wiki: wiki name, compared against the "wiki" column of the namespace
#         table.
#
# Returns a factor with one element per title holding the namespace number
# as a string. Titles that do not start with a known "<namespace>:" prefix
# are assigned "0".
#
# The namespace table is taken from a visible object called
# `wikia.namespaces` when one exists; otherwise it is read with
# load.wikia.namespaces().
titles.to.ns.num <- function (page.titles, wiki) {
  # load wikia namespace data from disk if it does not exist
  if (!exists("wikia.namespaces")) {
    wikia.namespaces <- load.wikia.namespaces()
  }

  ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
                            c("ns.num", "ns.string")]

  namespaces <- as.character(ns.df$ns.num)
  names(namespaces) <- as.character(ns.df$ns.string)

  # namespace 0 has no prefix: it is the default for unmatched titles
  namespaces <- namespaces[namespaces != "0"]

  # titles may use underscores where the namespace table uses spaces
  page.titles <- gsub("_", " ", page.titles, fixed = TRUE)
  page.ns <- rep("0", length(page.titles))

  for (ns in names(namespaces)) {
    # Compare the prefix literally. Building a regular expression from the
    # namespace name breaks on names containing metacharacters such as
    # "(", ")" or ".".
    prefix <- paste0(ns, ":")
    # which() drops NA comparisons, so NA titles keep the default "0"
    hits <- which(startsWith(page.titles, prefix))
    page.ns[hits] <- namespaces[ns]
  }

  # return the namespaces as a factor
  return(as.factor(page.ns))
}
|
||||
Reference in New Issue
Block a user