We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.

library(data.table)

# import ngram data
# note that the file is not pushed to the repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?); [,-3] drops the third column, which is not used below
ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
                     header=TRUE, stringsAsFactors=FALSE)[,-3]
names(ngrams)[1] <- "eid"
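
# alternative: data.table's fread() is usually much faster than
# read.delim() for a file this size and returns a data.table directly.
# a sketch, assuming the same file layout (drop=3 removes the third
# column at read time):
#
# ngrams <- fread("processed_data/ngram_table.csv", drop=3)
# setnames(ngrams, 1, "eid")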

subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
                       stringsAsFactors=FALSE, sep="\t")[,c("eid",
                       "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"

# left join each ngram onto its subject area; takes a couple of minutes
ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)
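
# alternative: the same left join in data.table idiom, equivalent up to
# row and column order. a sketch, assuming both tables have been
# converted with as.data.table():
#
# ngrams <- as.data.table(ngrams)
# subjects <- as.data.table(subjects)
# ngrams <- subjects[ngrams, on="eid"]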

# only use ngrams that occur across all (many?) subject areas
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x)))
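
# alternative: data.table's uniqueN() computes the same per-term counts
# and scales better on a table this large; note it returns a two-column
# table rather than the named vector tapply() gives. a sketch:
#
# subject.by.ngram <- as.data.table(ngrams)[, .(n = uniqueN(subject)),
#                                           by=term]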

# summary(subject.by.ngram)
#
# library(txtplot)
# txtdensity(log(subject.by.ngram))

# Note:
# The median number of subject areas per term is five. We keep only
# terms that occur across more than 30 subject areas.

top.ngrams <- ngrams[ngrams$term %in%
                     names(subject.by.ngram[subject.by.ngram > 30]),
                     c("eid", "term")]

# drop the large intermediate objects to free memory
rm(ngrams, subject.by.ngram, subjects)

# convert to a wide format matrix of dichotomous variables
library(reshape2)

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

top.ngrams[, vv := TRUE]  # flag each (eid, term) pair for counting

# took more than 20 minutes on hyak
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
                          value.var = "vv")
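
# alternative: the wide 0/1 matrix is mostly zeros, so a sparse matrix
# avoids the long dcast() run and uses far less memory. a sketch with
# the Matrix package (run before top.ngrams is removed; duplicate
# (eid, term) pairs are summed, matching the length() counts above):
#
# library(Matrix)
# eids <- unique(top.ngrams$eid)
# terms <- unique(top.ngrams$term)
# top.ngram.sparse <- sparseMatrix(i = match(top.ngrams$eid, eids),
#                                  j = match(top.ngrams$term, terms),
#                                  x = 1,
#                                  dimnames = list(as.character(eids),
#                                                  terms))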

rm(top.ngrams)

save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
# load("processed_data/top.ngram.matrix.RData")
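
# load() restores the object under the name it was saved with; a quick
# check after reloading (the first column is eid, the rest are per-term
# counts):
#
# load("processed_data/top.ngram.matrix.RData")
# dim(top.ngram.matrix)
# top.ngram.matrix[1:5, 1:5]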