1
0

initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
This commit is contained in:
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
library(data.table)
# Load the ngram table and attach each document's subject area.
#
# The ngram file is not pushed to the repository; a copy is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
# It is believed to hold the top 100,000 ngrams (unconfirmed).
ngrams <- read.csv("processed_data/ngram_table.csv", header = TRUE,
                   stringsAsFactors = FALSE)
# The third column is not needed; the first column is the document id.
ngrams <- ngrams[, -3]
names(ngrams)[1] <- "eid"

# From the abstracts table keep only the document id and its first ASJC
# subject area, renaming the latter to "subject".
subjects <- read.delim("processed_data/abstracts.tsv", header = TRUE,
                       stringsAsFactors = FALSE)
subjects <- subjects[, c("eid", "first_ASJC_subject_area")]
names(subjects) <- c("eid", "subject")

# Left join: every ngram row is kept, with subject NA when the eid has no
# match in the abstracts table. This takes a couple of minutes.
ngrams <- merge(x = ngrams, y = subjects, by = "eid", all.x = TRUE)
# Keep only ngrams that occur across many subject areas.
# First count, for each term, the number of distinct subject values it is
# seen with.
term.subject.count <- tapply(X = ngrams$subject, INDEX = ngrams$term,
                             FUN = function(s) length(unique(s)))
# To inspect the distribution interactively:
# summary(term.subject.count)
#
# library(txtplot)
# txtdensity(log(term.subject.count))
#
# Note:
# The median number of subject areas per term is five. The cutoff below
# keeps terms that occur in more than 30 subject areas.
top.ngrams <- ngrams[
  ngrams$term %in% names(term.subject.count)[term.subject.count > 30],
  c("eid", "term")
]
# Free the large intermediates before reshaping.
rm(subjects, term.subject.count, ngrams)
# Convert to a wide-format matrix: one row per document (eid), one column
# per retained term. Each cell holds the number of (eid, term) rows, so the
# columns are 0/1 indicators provided each pair appears at most once.
library(reshape2)
library(data.table)
top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)
top.ngrams[, vv := TRUE]
# reshape2 is attached after data.table (first loaded at the top of this
# script), so an unqualified dcast() resolves to reshape2's data.frame
# implementation and bypasses the data.table set up just above; that run
# took more than 20 minutes on hyak. Call data.table's dcast explicitly.
top.ngram.matrix <- data.table::dcast(top.ngrams, eid ~ term,
                                      fun.aggregate = length,
                                      value.var = "vv")
# reshape2::dcast returned a plain data.frame; keep that class so code that
# loads the saved object behaves as before. setDF converts by reference.
# NOTE(review): data.table orders the term columns in C-locale order, which
# can differ from reshape2's ordering for mixed-case or punctuated terms --
# confirm nothing downstream relies on column position.
setDF(top.ngram.matrix)
rm(top.ngrams)
save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
#load("processed_data/top.ngram.matrix.RData")