initial import of material for public archive into git
We're creating a fresh archive because the git history of our old repository for this chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
56
code/prediction/02-build_textual_features.R
Normal file
56
code/prediction/02-build_textual_features.R
Normal file
@@ -0,0 +1,56 @@
|
||||
library(data.table)
|
||||
|
||||
|
||||
# Import the ngram data.
# Note: the input file is not pushed to the repository, but it is available
# on hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
#
# Top 100,000 ngrams (?)
ngrams <- read.csv("processed_data/ngram_table.csv", stringsAsFactors = FALSE)

# Drop the third column and rename the first one to "eid", the key used
# below when merging in the subject areas.
ngrams <- ngrams[, -3]
names(ngrams)[1] <- "eid"
|
||||
|
||||
# Load the abstracts table (tab-separated, with a header row) and keep only
# the eid and first ASJC subject area columns; the latter is renamed to
# "subject".
subjects <- read.delim("processed_data/abstracts.tsv", stringsAsFactors = FALSE)
subjects <- subjects[, c("eid", "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"
|
||||
|
||||
# Attach a subject area to every ngram row with a left join on eid. Rows
# whose eid has no match in `subjects` are kept, with subject set to NA.
# takes a couple of minutes:
ngrams <- merge(ngrams, subjects, by = "eid", all.x = TRUE)

# only use ngrams that occur across all (many?) subject areas
#
# Count the distinct subject areas each term appears in. NA subjects (from
# unmatched eids in the left join above, or missing values in the input) are
# dropped first so that "no subject" is not counted as a subject area.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term,
                           function(x) length(unique(x[!is.na(x)])))
|
||||
|
||||
# summary(subject.by.ngram)
|
||||
#
|
||||
# library(txtplot)
|
||||
# txtdensity(log(subject.by.ngram))
|
||||
|
||||
# Note:
|
||||
# The median number of subject areas per term is five. We'll cut it
# off at terms that occur across more than 30 subject areas.
|
||||
|
||||
# Keep only the (eid, term) pairs whose term spans more than 30 subject
# areas.
common.terms <- names(subject.by.ngram)[subject.by.ngram > 30]
top.ngrams <- ngrams[ngrams$term %in% common.terms, c("eid", "term")]

# Drop the large intermediates (and the temporary term list) before the
# reshape below.
rm(ngrams, subject.by.ngram, subjects, common.terms)
|
||||
|
||||
# Convert to a wide format table with one row per eid and one column per
# term.
# NOTE(review): cells are row counts per (eid, term) pair (`length`), so they
# are dichotomous only if each pair occurs at most once in the input --
# confirm against the ngram table.
library(data.table)
library(reshape2)

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

# Placeholder column for dcast() to aggregate over.
top.ngrams[, vv := TRUE]

# took more than 20 minutes on hyak
#
# Current data.table versions also export dcast(), so an unqualified call
# resolves to whichever package was attached last. Call reshape2's version
# explicitly (the one this script's attach order resolves to) so the result
# does not depend on attach order.
# NOTE(review): data.table::dcast would likely be faster, but it returns a
# data.table and may order the term columns differently -- check downstream
# scripts before switching.
top.ngram.matrix <- reshape2::dcast(top.ngrams, eid ~ term,
                                    fun.aggregate = length,
                                    value.var = "vv")
|
||||
|
||||
# The long-format table is no longer needed once the wide table exists.
rm(list = "top.ngrams")

# Write the wide table to disk.
save(list = "top.ngram.matrix",
     file = "processed_data/top.ngram.matrix.RData")
#load("processed_data/top.ngram.matrix.RData")
|
||||
Reference in New Issue
Block a user