Initial import of material for the public archive into git.
We're creating a fresh archive because the history of our old chapter repository includes API keys, data files, and other material we can't share.
This commit is contained in:
89
code/prediction/00_ngram_extraction.py
Normal file
89
code/prediction/00_ngram_extraction.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from time import time
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
import csv
|
||||
import argparse
|
||||
|
||||
n_features = 100000 # Gets the top n_features terms
|
||||
n_samples = None # Enter an integer here for testing, so it doesn't take so long
|
||||
|
||||
def main():
    """Parse CLI options, load abstracts, and write the n-gram count CSV."""
    parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
                        default=3)
    args = parser.parse_args()

    print("Loading dataset...")
    start = time()
    doc_ids, abstracts = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - start))

    # Header first (truncates any previous output file), then the counts
    # are appended row by row.
    write_header(args.o)
    bags_o_words = get_counts(abstracts, n_features, args.n)
    write_output(doc_ids, bags_o_words, args.o)
|
||||
|
||||
def get_counts(abstracts, n_features, ngram_max):
    """Vectorize abstracts into per-document term-count dictionaries.

    Fits a CountVectorizer over 1..ngram_max n-grams (English stop words
    removed, terms in >95% of documents or <2 documents dropped, capped
    at n_features terms) and returns one {term: count} dict per abstract.
    """
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english',
                                    ngram_range=(1, ngram_max))
    t0 = time()
    tf = tf_vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - t0))

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; prefer get_feature_names_out() when it exists.
    try:
        terms = tf_vectorizer.get_feature_names_out()
    except AttributeError:
        terms = tf_vectorizer.get_feature_names()
    # NOTE: toarray() densifies the document-term matrix; with up to
    # n_features=100000 columns this can be memory-hungry on large corpora.
    freqs = tf.toarray()
    bags_o_words = to_bags_o_words(terms, freqs)
    return bags_o_words
|
||||
|
||||
|
||||
def write_header(out_file):
    """Create (or truncate) out_file and write the CSV header row."""
    with open(out_file, 'w') as handle:
        csv.writer(handle).writerow(['document_id', 'term', 'frequency'])
|
||||
|
||||
def to_bags_o_words(terms, freqs):
    '''Takes in the vectorizer stuff, and returns a list of dictionaries, one for each document.
    The format of the dictionaries is term:count within that document.
    '''
    # Pair each term with its count in the document's frequency row and
    # keep only the terms that actually occur.
    return [{term: count for term, count in zip(terms, doc_row) if count > 0}
            for doc_row in freqs]
|
||||
|
||||
def write_output(ids, bags_o_words, out_file):
    """Append one (document_id, term, frequency) CSV row per term per document."""
    with open(out_file, 'a') as sink:
        writer = csv.writer(sink)
        for doc_id, bag in zip(ids, bags_o_words):
            # For each term and count, output a row, together with the document id
            for term, count in bag.items():
                writer.writerow([doc_id, term, count])
|
||||
|
||||
def get_ids_and_abstracts(fn, length_limit):
    """Read a tab-separated file of abstracts and return (ids, abstracts).

    fn -- path to a TSV with at least 'eid' and 'abstract' columns.
    length_limit -- if truthy, stop after this many valid rows (for testing).

    Rows missing either column are printed and skipped. The two returned
    lists are always the same length and aligned by position.
    """
    ids = []
    abstracts = []
    with open(fn, 'r') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            # Look up both fields before appending either, so a KeyError
            # can no longer leave ids and abstracts out of sync (the old
            # code appended the abstract before reading the eid).
            try:
                abstract = row['abstract']
                doc_id = row['eid']
            except KeyError:
                print(row)
                continue
            abstracts.append(abstract)
            ids.append(doc_id)
            # Stop at exactly length_limit rows (the old counter was
            # checked after the append, returning length_limit + 1 rows).
            if length_limit and len(ids) >= length_limit:
                break
    return ids, abstracts
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
89
code/prediction/01-build_control_variables.R
Normal file
89
code/prediction/01-build_control_variables.R
Normal file
@@ -0,0 +1,89 @@
|
||||
source("code/prediction/utils.R")
|
||||
|
||||
# use this to store things for use in the paper
|
||||
pred.descrip <- NULL
|
||||
|
||||
abstracts <- read.delim("processed_data/abstracts.tsv", header=TRUE,
|
||||
stringsAsFactors=FALSE, sep="\t")
|
||||
|
||||
abstracts <- subset(abstracts, select = -abstract)
|
||||
|
||||
abstracts <- abstracts[abstracts$aggregation_type != "Trade Journal" &
|
||||
is.na(abstracts$aggregation_type) == FALSE, ]
|
||||
|
||||
names(abstracts)[names(abstracts) == 'num_citations'] <- 'works_cited'
|
||||
abstracts$works_cited[is.na(abstracts$works_cited) == TRUE] <- 0
|
||||
|
||||
# affiliations
|
||||
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
|
||||
header=TRUE, stringsAsFactors=FALSE,
|
||||
sep="\t")
|
||||
|
||||
# eliminate missing values
|
||||
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
|
||||
affiliations$organization != "", ]
|
||||
|
||||
|
||||
# Map an affiliation id to the most frequently recorded organization
# string for that id (the "modal" spelling), resolving inconsistent
# free-text organization names.
#
# aff.id -- a single affiliation_id value
# aff.df -- data frame with affiliation_id and organization columns
#           (defaults to the global `affiliations`)
remap.affiliations <- function(aff.id,
                               aff.df = affiliations){
    # Bug fix: the body previously read the global `affiliations`
    # directly, silently ignoring the aff.df argument.
    matches <- aff.df$organization[aff.df$affiliation_id == aff.id]
    org.modal <- names(tail(sort(table(matches)), 1))
    return(org.modal)
}
|
||||
|
||||
affiliations$organization <- sapply(affiliations$affiliation_id, remap.affiliations)
|
||||
|
||||
affiliations <- subset(affiliations, select = c(paper_eid,
|
||||
organization))
|
||||
names(affiliations) <- c("eid", "affiliation")
|
||||
|
||||
# need to remove repeat affiliations
|
||||
affiliations <- affiliations[duplicated(affiliations$eid) == FALSE,]
|
||||
|
||||
|
||||
######################################
|
||||
d <- abstracts[, c("eid", "language", "modal_country",
|
||||
"source_title", "works_cited")]
|
||||
|
||||
# dichotomous dependent variable
|
||||
d$cited <- abstracts$cited_by_count > 0
|
||||
|
||||
|
||||
# store this here for use in the paper before we run any restrictions:
|
||||
pred.descrip$cited <- d$cited
|
||||
pred.descrip$cites <- abstracts$cited_by_count
|
||||
|
||||
|
||||
# We want these to be categorical variables
|
||||
d$modal_country <- factor(d$modal_country)
|
||||
d$language <- factor(d$language)
|
||||
d$subject <- factor(abstracts$first_ASJC_subject_area)
|
||||
d$source_title <- factor(d$source_title)
|
||||
d$month <- factor(strftime(abstracts$date, format= "%m"))
|
||||
# except for pub year - keep that continuous
|
||||
d$year <- as.numeric(strftime(abstracts$date, format="%Y"))
|
||||
|
||||
# bring in org affiliations
|
||||
d <- merge(d, affiliations, by="eid") # note that this drops papers
|
||||
# w/out org info
|
||||
|
||||
d$affiliation <- factor(d$affiliation)
|
||||
|
||||
##### Restrictions:
|
||||
|
||||
### do this explicitly so that changes are easy:
|
||||
d <- restrict(d, d$affiliation, 1)
|
||||
d <- restrict(d, d$subject, 1)
|
||||
d <- restrict(d, d$source_title, 1)
|
||||
d <- restrict(d, d$language, 1)
|
||||
d <- restrict(d, d$modal_country, 1)
|
||||
|
||||
# n.authors
|
||||
# per author prior citations
|
||||
|
||||
pred.descrip$covars <- d
|
||||
save(pred.descrip, file = "paper/data/prediction_descriptives.RData")
|
||||
|
||||
|
||||
rm(d, abstracts, affiliations)
|
||||
|
||||
56
code/prediction/02-build_textual_features.R
Normal file
56
code/prediction/02-build_textual_features.R
Normal file
@@ -0,0 +1,56 @@
|
||||
library(data.table)
|
||||
|
||||
|
||||
# import ngram data
|
||||
# note that the file is not pushed to repository, but is available on
|
||||
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
|
||||
|
||||
# Top 100,000 ngrams (?)
|
||||
ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
|
||||
header=TRUE, stringsAsFactors=FALSE)[,-3]
|
||||
names(ngrams)[1] <- "eid"
|
||||
|
||||
subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
|
||||
stringsAsFactors=FALSE, sep="\t")[,c("eid",
|
||||
"first_ASJC_subject_area")]
|
||||
names(subjects)[2] <- "subject"
|
||||
|
||||
# takes a couple of minutes:
|
||||
ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)
|
||||
|
||||
# only use ngrams that occur accross all (many?) subject areas
|
||||
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
|
||||
length(unique(x)))
|
||||
|
||||
# summary(subject.by.ngram)
|
||||
#
|
||||
# library(txtplot)
|
||||
# txtdensity(log(subject.by.ngram))
|
||||
|
||||
# Note:
|
||||
# The median number of subject areas per term is five. We'll cut it
|
||||
# off at terms that occur across at least 30 subject areas.
|
||||
|
||||
top.ngrams <- ngrams[ngrams$term %in%
|
||||
names(subject.by.ngram[subject.by.ngram >
|
||||
30]),c("eid", "term")]
|
||||
|
||||
rm(ngrams, subject.by.ngram, subjects)
|
||||
|
||||
# convert to a wide format matrix of dichotomous variables
|
||||
library(reshape2)
|
||||
library(data.table)
|
||||
|
||||
top.ngrams <- data.table(top.ngrams)
|
||||
setkey(top.ngrams, eid)
|
||||
|
||||
top.ngrams[,vv:= TRUE]
|
||||
|
||||
# took more than 20 minutes on hyak
|
||||
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
|
||||
value.var = "vv")
|
||||
|
||||
rm(top.ngrams)
|
||||
|
||||
save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
|
||||
#load("processed_data/top.ngram.matrix.RData")
|
||||
221
code/prediction/03-prediction_analysis.R
Normal file
221
code/prediction/03-prediction_analysis.R
Normal file
@@ -0,0 +1,221 @@
|
||||
library(data.table)
|
||||
library(Matrix)
|
||||
library(glmnet)
|
||||
library(xtable)
|
||||
library(methods)
|
||||
|
||||
predict.list <- NULL
|
||||
|
||||
if(!exists("top.ngram.matrix")){
|
||||
load("processed_data/top.ngram.matrix.RData")
|
||||
}
|
||||
|
||||
if(!exists("pred.descrip")){
|
||||
load("paper/data/prediction_descriptives.RData")
|
||||
covars <- pred.descrip$covars
|
||||
}
|
||||
|
||||
top.ngram.matrix <- data.table(top.ngram.matrix)
|
||||
setkey(top.ngram.matrix, eid)
|
||||
covars <- data.table(pred.descrip$covars)
|
||||
setkey(covars,eid)
|
||||
|
||||
# restrict to the overlap of the two datasets
|
||||
covars <- covars[covars$eid %in% top.ngram.matrix$eid,]
|
||||
|
||||
top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in%
|
||||
covars$eid,]
|
||||
|
||||
# rename the cited column in case it doesn't appear
|
||||
names(covars)[names(covars) == 'cited'] <- 'cited.x'
|
||||
|
||||
# then merge also to facilitate some manipulations below
|
||||
d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE)
|
||||
|
||||
# Note that this duplicates some column names so X gets appended in a
|
||||
# few cases.
|
||||
|
||||
# construct model matrices
|
||||
x.controls <- sparse.model.matrix(cited.x ~ language.x +
|
||||
modal_country + month.x,
|
||||
data=d)[,-1]
|
||||
|
||||
x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1]
|
||||
x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1]
|
||||
x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1]
|
||||
|
||||
x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid))
|
||||
x.ngrams <- as(x.ngrams, "sparseMatrix")
|
||||
|
||||
X <- cBind(x.controls, covars$year.x, covars$works.cited)
|
||||
X.aff <- cBind(X, x.aff)
|
||||
X.subj <- cBind(X.aff, x.subj)
|
||||
X.venue <- cBind(X.subj, x.venue)
|
||||
X.terms <- cBind(X.venue, x.ngrams)
|
||||
|
||||
Y <- covars$cited
|
||||
|
||||
### Hold-back sample for testing model performance later on:
|
||||
set.seed(20160719)
|
||||
holdback.index <- sample(nrow(X), round(nrow(X)*.1))
|
||||
|
||||
X.hold <- X[holdback.index,]
|
||||
X.hold.aff <- X.aff[holdback.index,]
|
||||
X.hold.subj <- X.subj[holdback.index,]
|
||||
X.hold.venue <- X.venue[holdback.index,]
|
||||
X.hold.terms <- X.terms[holdback.index,]
|
||||
Y.hold <- Y[holdback.index]
|
||||
|
||||
X.test <- X[-holdback.index,]
|
||||
X.test.aff <- X.aff[-holdback.index,]
|
||||
X.test.subj <- X.subj[-holdback.index,]
|
||||
X.test.venue <- X.venue[-holdback.index,]
|
||||
X.test.terms <- X.terms[-holdback.index,]
|
||||
Y.test <- Y[-holdback.index]
|
||||
|
||||
############### Models and prediction
|
||||
|
||||
set.seed(20160719)
|
||||
|
||||
m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
con.pred = predict(m.con, type="class", s="lambda.min",
|
||||
newx=X.hold)
|
||||
|
||||
m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
aff.pred = predict(m.aff, type="class", s="lambda.min",
|
||||
newx=X.hold.aff)
|
||||
|
||||
m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
subj.pred = predict(m.subj, type="class", s="lambda.min",
|
||||
newx=X.hold.subj)
|
||||
|
||||
m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
venue.pred = predict(m.venue, type="class", s="lambda.min",
|
||||
newx=X.hold.venue)
|
||||
|
||||
m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
terms.pred = predict(m.terms, type="class", s="lambda.min",
|
||||
newx=X.hold.terms)
|
||||
|
||||
##########
|
||||
# Compare test set predictions against held-back sample:
|
||||
|
||||
pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred,
|
||||
venue.pred, terms.pred))
|
||||
names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue",
|
||||
"+ Terms")
|
||||
|
||||
m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms)
|
||||
|
||||
# collect:
|
||||
# df
|
||||
# percent.deviance
|
||||
# nonzero coefficients
|
||||
# prediction error
|
||||
|
||||
# Summarize one fitted cv.glmnet model as a numeric vector of:
# number of selected features, percent of deviance explained, and
# cross-validation error (all taken from the final lambda in the path).
gen.m.summ.info <- function(model){
    fit <- model$glmnet.fit
    n.features <- round(tail(fit$df, 1), 0)
    pct.deviance <- round(tail(fit$dev.ratio, 1), 2) * 100
    cv.err <- round(tail(model$cvm, 1), 2) * 100
    c(n.features, pct.deviance, cv.err)
}
|
||||
|
||||
# Percent of held-back observations misclassified by `pred`.
# Computed as the share of mismatched (pred, test) pairs — equal to the
# sum of the off-diagonal cells of the 2x2 confusion table, but unlike
# the old props[1,2] + props[2,1] indexing it does not error out when a
# degenerate model predicts only one class (a 1-row table).
gen.class.err <- function(pred, test){
    err.sum <- round(mean(as.character(pred) != as.character(test)), 2)*100
    return(err.sum)
}
|
||||
|
||||
|
||||
results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list,
|
||||
gen.m.summ.info)),
|
||||
byrow=T, nrow=5)))
|
||||
|
||||
results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x,
|
||||
Y.hold))
|
||||
|
||||
results.tab <- data.frame(lapply(results.tab, as.character))
|
||||
|
||||
|
||||
|
||||
names(results.tab) <- c("Model", "N features", "Deviance (%)",
|
||||
"CV error (%)", "Hold-back error (%)")
|
||||
|
||||
|
||||
print(xtable(results.tab,
|
||||
caption=
|
||||
"Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.",
|
||||
label='tab:predict_models', align='llrrrr'),
|
||||
include.rownames=FALSE)
|
||||
|
||||
# Store the results:
|
||||
predict.list$results.tab <- results.tab
|
||||
|
||||
|
||||
|
||||
|
||||
############# Generate most salient coefficients
|
||||
nz.coefs <- data.frame( coef =
|
||||
colnames(X.test.terms)[which(
|
||||
coef(m.terms, s="lambda.min")
|
||||
!= 0)],
|
||||
type = "term",
|
||||
beta =
|
||||
coef(m.terms,
|
||||
s="lambda.min")[which(coef(m.terms,
|
||||
s="lambda.min")
|
||||
!= 0)])
|
||||
|
||||
nz.coefs$coef <- as.character(nz.coefs$coef)
|
||||
nz.coefs$type <- as.character(nz.coefs$type)
|
||||
nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),]
|
||||
|
||||
# comparison:
|
||||
|
||||
#nz.coefs$type <- "terms"
|
||||
nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA
|
||||
nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue"
|
||||
nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject"
|
||||
nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation"
|
||||
nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month"
|
||||
nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country"
|
||||
nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language"
|
||||
nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year"
|
||||
|
||||
|
||||
# cleanup
|
||||
nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef)
|
||||
nz.coefs$beta <- round(nz.coefs$beta, 3)
|
||||
names(nz.coefs) <- c("Feature", "Type", "Coefficient")
|
||||
|
||||
predict.list$nz.coefs <- nz.coefs
|
||||
|
||||
# table for all
|
||||
round(prop.table(table(nz.coefs$Type))*100, 2)
|
||||
|
||||
# for top subsets
|
||||
round(prop.table(table(nz.coefs$Type[1:700]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:200]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:100]))*100, 2)
|
||||
|
||||
print(xtable(
|
||||
as.matrix(head(nz.coefs, 10)),
|
||||
label='tab:nzcoefs',
|
||||
caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.',
|
||||
align='lllr'
|
||||
), include.rownames=FALSE)
|
||||
|
||||
|
||||
# output
|
||||
save(predict.list, file="paper/data/prediction.RData")
|
||||
|
||||
|
||||
13
code/prediction/utils.R
Normal file
13
code/prediction/utils.R
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Use this to check for underpopulated cells
|
||||
# Number of unique papers (eids) in each cell of the categorical
# variable c.var. Use this to check for underpopulated cells.
gen.counts <- function(df, c.var){
    count.unique <- function(eids) length(unique(eids))
    tapply(df[, "eid"], c.var, count.unique)
}
|
||||
|
||||
# use this to remove underpopulated cells
|
||||
# Remove observations that fall in underpopulated cells of the
# categorical variable c.var: keep only rows whose level has strictly
# more than c.min unique papers (as counted by gen.counts).
restrict <- function(df, c.var, c.min){
    cell.counts <- gen.counts(df, c.var)
    keep.levels <- names(cell.counts[cell.counts > c.min])
    out.df <- df[c.var %in% keep.levels, ]
    return(out.df)
}
|
||||
Reference in New Issue
Block a user