initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions
--- a/code/prediction/02-build_textual_features.R
+++ b/code/prediction/02-build_textual_features.R
@@ -0,0 +1,56 @@
+library(data.table)
+
+
+# import ngram data
+# note that the file is not pushed to the repository, but is available on
+# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
+
+# Top 100,000 ngrams (?); the third column of the CSV is dropped on read
+ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
+                     header=TRUE, stringsAsFactors=FALSE)[,-3]
+names(ngrams)[1] <- "eid"  # rename first column to match the merge key
+
+subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
+                         stringsAsFactors=FALSE, sep="\t")[,c("eid",
+                         "first_ASJC_subject_area")]
+names(subjects)[2] <- "subject"  # shorter name for first_ASJC_subject_area
+
+# left join on eid (unmatched rows get subject NA); takes a couple of minutes:
+ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)
+
+# only use ngrams that occur across many subject areas: count areas per term
+subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
+    length(unique(x)))  # NB: unique() keeps NA, so NA counts as an area
+
+# summary(subject.by.ngram)
+#
+# library(txtplot)
+# txtdensity(log(subject.by.ngram))
+
+# Note:
+# The median number of subject areas per term is five. We'll keep only
+# terms that occur across more than 30 subject areas (strict `>` below).
+
+top.ngrams <- ngrams[ngrams$term %in%
+                     names(subject.by.ngram[subject.by.ngram >
+                     30]),c("eid", "term")]
+
+rm(ngrams, subject.by.ngram, subjects)  # intermediates are not used below
+
+# convert to a wide format eid-by-term table (one column per term)
+library(reshape2)  # NOTE(review): attached last, may mask data.table's dcast
+library(data.table)  # already attached at the top of the script; no effect
+
+top.ngrams <- data.table(top.ngrams)
+setkey(top.ngrams, eid)
+
+top.ngrams[,vv:= TRUE]  # dummy value column for dcast to aggregate
+# `length` makes each cell the number of (eid, term) rows, so values are
+# 0/1 only if pairs are unique (TODO confirm); took more than 20 minutes on hyak
+top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
+                          value.var = "vv")
+
+rm(top.ngrams)
+
+save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
+# to reload: load("processed_data/top.ngram.matrix.RData")