initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
2493
code/bibliometrics/00_citation_network_analysis.ipynb
Normal file
2493
code/bibliometrics/00_citation_network_analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
232
code/bibliometrics/00_citation_network_analysis.py
Normal file
232
code/bibliometrics/00_citation_network_analysis.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# coding: utf-8
|
||||
# # Import data and get things setup
|
||||
|
||||
import random
|
||||
random.seed(9001)
|
||||
|
||||
# import code to write r modules and create our variable we'll write to
|
||||
import rpy2.robjects as robjects
|
||||
from rpy2.robjects import pandas2ri
|
||||
pandas2ri.activate()
|
||||
|
||||
# Module-level store of everything we want to export to R at the end.
r = {}


def remember(name, x):
    """Stash *x* under *name* for later export to the paper's RData file."""
    r[name] = x
|
||||
|
||||
# load in modules we'll need for analysis
|
||||
import subprocess
|
||||
import csv
|
||||
from igraph import *
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
|
||||
# grab the largest connected component with a little function
def get_largest_component(g):
    """Return the largest weakly-connected component of graph *g* as a subgraph."""
    components = g.components(mode="WEAK")
    biggest = max(components.sizes())
    for sub in components.subgraphs():
        if sub.vcount() == biggest:
            return sub
|
||||
|
||||
# load the full edgelist into igraph
def edge_list_iter(df):
    """Yield a (from, to) tuple for every row of edgelist DataFrame *df*."""
    for _, row in df.iterrows():
        yield row['from'], row['to']
|
||||
|
||||
# list top 5 journals for each of the clusters
def top_journals_for_clusters(clu):
    """Return a DataFrame of the five most common journals per cluster.

    *clu* must carry 'eid' and 'cluster' columns; journal names are joined
    in from the module-level ``articles`` table.
    """
    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])

    # DataFrame.append() was removed in pandas 2.0 -- collect the
    # per-cluster counts and concatenate once instead.
    pieces = []
    for cid in articles_tmp['cluster'].unique():
        journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)
        pieces.append(pd.DataFrame({'cluster': cid, 'count': journal_counts}))
    output = pd.concat(pieces) if pieces else pd.DataFrame()

    # journal names live in the index after value_counts(); surface them
    output = output.reset_index()
    output = output.rename(columns={'index': "journal"})
    return output
|
||||
|
||||
def infomap_edgelist(g, edgelist_filename, directed=True):
    """Cluster graph *g* with the external Infomap binary.

    Writes g's edgelist to code/bibliometrics/<edgelist_filename>.txt,
    shells out to Infomap, then reads the resulting .clu file back and
    returns a DataFrame mapping each vertex's 'eid' to its 'cluster'
    (plus Infomap's 'flow' score), joined on the integer node id.
    """
    # map igraph's integer vertex ids back to the Scopus 'eid' names
    nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,
                                'eid' : v['name']} for v in g.vs ])

    # write out the edgelist to an external file so we can call infomap on it
    with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
        for e in g.es:
            # skip self-loops
            if e.source != e.target:
                if 'weight' in e.attributes():
                    print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
                else:
                    print("{}\t{}".format(e.source, e.target), file=f)

    # run the external program to generate the infomap clustering
    # NOTE(review): the third list element bundles the output directory and
    # the -z/--map/--clu/--tree flags into a single argv entry, so Infomap
    # receives them as one argument -- confirm this invocation works.
    infomap_cmdline = ["code/bibliometrics/infomap/Infomap", "code/bibliometrics/" + edgelist_filename + ".txt", "code/bibliometrics/output_dir -z --map --clu --tree"]
    if directed:
        infomap_cmdline.append("-d")
    subprocess.call(infomap_cmdline)

    # load up the clu data
    clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
                      header=None, comment="#", delim_whitespace=True)
    clu.columns = ['node_infomap', 'cluster', 'flow']

    return pd.merge(clu, nodes_tmp, on="node_infomap")
|
||||
|
||||
|
||||
def write_graphml(g, clu, graphml_filename):
    """Attach cluster labels to the vertices of *g* and write it as GraphML."""
    # sort by Infomap node id so labels line up with vertex order
    ordered = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
    g.vs["cluster"] = ordered["cluster"].tolist()
    g.write_graphml("code/bibliometrics/" + graphml_filename)
|
||||
|
||||
|
||||
# load article data
articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")

# # network for just the central "social media" set

# this contains the list of all INCOMING citations for papers in the original set
raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")

# build a directed citation graph from the (from, to) pairs
g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)

# keep only the largest weakly-connected component, then drop
# self-loops and multi-edges before clustering
g_sm = get_largest_component(g_sm_all)
g_sm = g_sm.simplify()

# cluster the social-media network with Infomap
g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)

# notebook-style peek at cluster sizes (value discarded when run as a script)
g_sm_clu['cluster'].value_counts()

write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
|
||||
|
||||
|
||||
# # larger network that contains the incoming cites to citing articles

# this contains the list of all INCOMING citations to everything in the original set
# plus every INCOMING citation to every paper that cites one of those papers
raw_edgelist_files = ["processed_data/citation_edgelist.txt",
                      "processed_data/social_media_edgelist.txt"]
combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])

g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)

# largest weakly-connected component, simplified, same as the sm network
g_full = get_largest_component(g_full_all)
g_full = g_full.simplify()

# NOTE(review): "edglist" (sic) is the on-disk name downstream steps expect;
# renaming it here would orphan the generated files.
g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)

# notebook-style peeks (values discarded when run as a script)
g_full_clu['cluster'].value_counts()

top_journals_for_clusters(g_full_clu)

write_graphml(g_full, g_full_clu, "g_full.graphml")
|
||||
|
||||
|
||||
# # create the meta-network of connections between clusters

# label each edge with the cluster of its citation target...
edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]], how="inner", left_on="to", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
# drop the merge key; the positional axis argument to drop() was removed
# in pandas 2.0, so use the explicit columns= form
edgelist_tmp.drop(columns='eid', inplace=True)

# ...and with the cluster of its citation source
edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]], how="inner", left_on="from", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
edgelist_tmp.drop(columns='eid', inplace=True)

# keep only between-cluster citations
edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]

# cross-tabulate into a cluster-by-cluster citation-count matrix, then
# melt back into a long (to_cluster, from_cluster, value) edgelist
cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
cluster_edgelist["to_cluster"] = cluster_edgelist.index

cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]

remember("cluster_edgelist", cluster_edgelist)

# the six biggest clusters become the nodes of the meta-network
top_clusters = g_sm_clu["cluster"].value_counts().head(6).index

# write the edgelist for the total number of clusters (currently 1-6)
cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
                                           (cluster_edgelist["from_cluster"].isin(top_clusters))]

cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]

g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values], directed=True)
g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()

# assign the number of total articles as an attribute for each node
g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()

g_cluster.write_graphml("code/bibliometrics/clusters.graphml")
|
||||
|
||||
# # create network stats for tables (overall and within clusters)

def create_network_stats(g):
    """Per-vertex centrality statistics for *g*, joined with article metadata."""
    stats = pd.DataFrame({'eid' : g.vs['name'],
                          'eig_cent' : g.eigenvector_centrality(),
                          'indegree' : g.indegree(),
                          'betweenness' : g.betweenness()})
    # inner join silently drops vertices with no matching article record
    return pd.merge(stats,
                    articles[['eid', 'title', 'source_title']],
                    how="inner")
|
||||
|
||||
network_stats = create_network_stats(g_full)

# notebook-style peeks at the most-cited / most-central papers
# (return values are discarded when run as a script)
network_stats.sort_values("indegree", ascending=False).head(4)

network_stats.sort_values("eig_cent", ascending=False).head(4)

network_stats.sort_values("betweenness", ascending=False).head(4)
|
||||
|
||||
# # things to store
remember('total_articles', articles.shape[0])

# total number of citations in the sm dataset
remember('sm_citations', raw_edgelist.shape[0])

# number of distinct citing papers in the sm dataset
remember('sm_citing', len(raw_edgelist["from"].unique()))

# the number of articles in the original dataset that have any INCOMING citations
remember('sm_cited', len(raw_edgelist["to"].unique()))

# total number of citations in the combined (full) dataset
remember('all_citations', combo_raw_edgelist.shape[0])

# number of distinct citing papers in the combined dataset
remember('all_citing', len(combo_raw_edgelist["from"].unique()))

# the number of articles in the combined dataset that have any INCOMING citations
remember('all_cited', len(combo_raw_edgelist["to"].unique()))

# per-article cluster assignments for the social-media network
remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])

# notebook-style listing of everything queued for export (value discarded)
sorted(r.keys())
|
||||
|
||||
# save the remembered values to an RData file
def save_to_r(r_dict, filename="output.RData"):
    """Export every (name, value) pair in *r_dict* to an RData file.

    Underscores in names become dots per R convention; DataFrames are
    converted with pandas2ri. All values are bundled into a single R list
    named 'r' and saved to *filename*.
    """
    for var_name, x in r_dict.items():  # was: iterated the global `r`, ignoring the parameter
        var_name = var_name.replace('_', '.')
        # np.asscalar() was removed in NumPy 1.23; .item() is the replacement
        if type(x) == np.int64:
            x = x.item()

        if type(x) == pd.DataFrame:
            rx = pandas2ri.py2ri(x)
        else:
            rx = x

        # was: assigned the unconverted `x`, discarding the py2ri conversion
        robjects.r.assign(var_name, rx)

    # bundle every assigned variable into one R list named 'r'
    robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
    robjects.r('save("r", file="{}")'.format(filename))
    robjects.r("rm(list=ls())")
|
||||
|
||||
# export everything collected via remember() for use by the paper's R code
save_to_r(r, "paper/data/network_data.RData")
|
||||
|
||||
BIN
code/bibliometrics/clusters.gephi
Normal file
BIN
code/bibliometrics/clusters.gephi
Normal file
Binary file not shown.
BIN
code/bibliometrics/g_sm.gephi
Normal file
BIN
code/bibliometrics/g_sm.gephi
Normal file
Binary file not shown.
24
code/data_collection/00_get_search_results.py
Normal file
24
code/data_collection/00_get_search_results.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import argparse
|
||||
from request_functions import *
|
||||
|
||||
'''
|
||||
This script takes in a search query and an output file. It queries the scopus API to find all papers that match the search query, and saves them to the output file.
|
||||
|
||||
Unlike some of the other scripts in this directory, it does not try to determine the state - if you restart the script, it will start over and blow away whatever you had saved before.
|
||||
'''
|
||||
|
||||
# query one year at a time to stay under the API's per-query result limits
years = range(2004, 2017)


def main():
    """Run the Scopus search for every year and write results to the output file."""
    parser = argparse.ArgumentParser(description='Output JSON of all articles matching search query')
    parser.add_argument('-q', help='Search query', required=True)
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    # 'w' mode: this script intentionally starts over rather than resuming
    with open(args.o, 'w') as out_file:
        for year in years:
            get_search_results(args.q, out_file, year=year)


if __name__ == '__main__':
    main()
|
||||
56
code/data_collection/01_get_abstracts.py
Normal file
56
code/data_collection/01_get_abstracts.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
|
||||
def main():
    """Fetch abstract/bibliography JSON for each requested eid, resuming if interrupted."""
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f]
    else:
        print('Need to either pass in an eid or a json file with eids')
        return  # was: fell through and crashed with NameError on `eids`

    # If the script gets interrupted, we need to start where we left off
    errors = []
    try:
        with open(args.o, 'r') as f:
            completed_eids = []
            for line in f:
                try:
                    result = json.loads(line)
                    completed_eids.append(result['abstracts-retrieval-response']['coredata']['eid'])
                except ValueError:
                    # unparseable line in the output file -- log it with the
                    # other failures below
                    errors.append(line)
    except IOError:
        # no output file yet, so nothing has been completed
        completed_eids = []

    print('{} completed eids'.format(len(completed_eids)))
    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid not in completed_eids:
                result = get_abstract(eid)
                if result:
                    out_file.write(result)
                    out_file.write('\n')
                else:
                    errors.append(eid)

    if len(errors) > 0:
        with open('raw_data/missing_eids.json', 'a') as l:
            # Add the bad lines from the output file
            # was: a generator expression that was never consumed, so
            # nothing was ever written here
            for e in errors:
                l.write(e)


if __name__ == '__main__':
    main()
|
||||
43
code/data_collection/02_get_cited_by.py
Normal file
43
code/data_collection/02_get_cited_by.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from os import remove
|
||||
|
||||
def main():
    """Fetch all articles that cite each input eid, appending JSON to the output.

    Resumable: parent eids already present in the output are skipped.
    Because results for the last parent may be incomplete, its lines are
    deleted (via sed) and that eid is fetched again.
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    with open(args.i, 'r') as f:
        # Make a dictionary of eid:citation count for each line in the file
        eids = {}
        for line in f:
            l = json.loads(line)
            eids[l['eid']] = l['citedby-count']

    # If the script gets interrupted, we need to start where we left off
    try:
        # Open the output file, and grab all of the eids which are already completed
        with open(args.o, 'r') as f:
            completed_eids = [json.loads(l)['parent_eid'] for l in f]
        # Remove those which came from the last id (since we may have missed some)
        if len(completed_eids) > 0:
            last_eid = completed_eids.pop()
            # Remove all of the lines which came from the last eid
            subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
            # Hopefully everything has worked out, because here we blow away the backup
            remove('{}.bak'.format(args.o))
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids
        completed_eids = []

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            # citedby-count arrives as a string, hence the '0' comparison
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)


if __name__ == '__main__':
    main()
|
||||
166
code/data_collection/request_functions.py
Normal file
166
code/data_collection/request_functions.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from scopus_api import key as API_KEY
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
|
||||
# DEBUG level also surfaces every request URL logged below
logging.basicConfig(level=logging.DEBUG)

# retry count for timed-out API calls, and per-request timeout in seconds
RETRY_COUNT = 5
TIMEOUT_SECS = 10

# Initialize a global session object
# (shared so the auth token stored by get_token() is reused by all calls)
s = requests.Session()
s.headers.update({'X-ELS-APIKey' : API_KEY,
                  'X-ELS-ResourceVersion' : 'XOCS',
                  'Accept' : 'application/json'})
|
||||
|
||||
def get_token(location_id = None):
    '''Given a location_id, gets an authentication token'''
    # Called when the API answers 401; the token is stored on the shared
    # session so every subsequent request carries it.
    print('Getting a token')
    api_resource = 'http://api.elsevier.com/authenticate'
    # Parameters
    payload = {'platform':'SCOPUS',
               'choice': location_id}
    r = s.get(api_resource, params = payload)
    r.raise_for_status()
    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
|
||||
|
||||
def get_search_results(query, output_file, results_per_call = 200,
                       tot_results=None, year=None, sort='+title', citation_call=False):
    '''Handles getting search results. Takes a query and an output
    file. Writes as many of the search results as possible to the
    output file as JSON dictionaries, one per line.

    Scopus caps a single sort order at 5000 results, so result sets
    between 5001 and 10000 are split into an ascending-title half and a
    descending-title half; anything beyond 10000 is truncated.
    citation_call=True tags each written entry with its parent_eid.
    '''
    result_set = []
    results_added = 0
    def curr_call(start=0, count=results_per_call):
        '''Shorthand for the current call: DRY'''
        return make_search_call(query, start=start,
                                count=count, year=year, sort=sort)
    if tot_results == None:
        # Call the API initially to figure out how many results there are, and write the results
        initial_results = curr_call(count=results_per_call)
        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
        result_set.append((initial_results, sort))
        results_added += results_per_call
        logging.debug("Total results: {}".format(tot_results))

    if tot_results == 0:
        return None
    if tot_results > 5000:
        # If this is just one year, we can't get any more granular, and
        # we need to return what we can.
        if tot_results > 10000:
            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
            first_half = last_half = 5000
        else:
            # Get half, and correct for odd # of results
            first_half = tot_results//2 + tot_results % 2
            last_half = tot_results//2
        # Break the search into the first half and the bottom half of results.
        # (each recursive call re-fetches from its own start, so the page
        # buffered in result_set above is deliberately not written here)
        get_search_results(query, output_file,
                           year = year,
                           tot_results=first_half)
        # Get the other half, sorted descending so it comes from the far end
        get_search_results(query, output_file,
                           year = year,
                           tot_results = last_half, sort='-title')
    # If there are 5000 or fewer to retrieve, then get them
    else:
        logging.debug('Retrieving {} results'.format(tot_results))
        # As long as there are more citations to retrieve, then do it, and write
        # them to the file
        while results_added < tot_results:
            # If we are near the end, then only get as many results as are left.
            to_retrieve = min(results_per_call, (tot_results - results_added))
            curr_results = curr_call(start=results_added, count=to_retrieve)
            result_set.append((curr_results, sort))
            results_added += results_per_call
        # This is hacky, but I'm doing it
        # If this is a citation call, then construct metadata to be written with the result
        if citation_call:
            metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
        else:
            metadata = {}
        # NOTE(review): indentation was lost in this archive dump; this
        # write is placed inside the <=5000 branch, which matches the
        # split-and-recurse logic above -- confirm against the original.
        write_results(result_set, output_file, metadata)
|
||||
|
||||
def write_results(result_set, output_file, metadata=None):
    """Write search results to *output_file* as JSON dictionaries, one per line.

    *result_set* is a list of (response_json, sort) pairs; results fetched
    with a descending sort ('-' prefix) are reversed so the file keeps a
    consistent order. Each entry is augmented with the *metadata* keys
    before being written.
    """
    # was: `metadata={}` -- a shared mutable default argument
    if metadata is None:
        metadata = {}
    for search_json, sort_order in result_set:
        to_reverse = sort_order.startswith('-')
        # (the original wrapped this lookup in `except KeyError: raise`,
        # which is a no-op; a missing key still propagates)
        results = [x for x in search_json['search-results']['entry']]
        if to_reverse:
            results = results[::-1]
        for x in results:
            for k, v in metadata.items():
                x[k] = v
            json.dump(x, output_file)
            output_file.write('\n')
|
||||
|
||||
|
||||
def make_search_call(query, start=0, count=200,
                     sort='+title', year=None,
                     retry_limit = RETRY_COUNT,
                     timeout_secs = TIMEOUT_SECS):
    """Issue one Scopus search request and return the parsed JSON.

    Retries on timeout up to *retry_limit* times; a 401 triggers token
    refresh and a retry, a 400 raises immediately, anything else
    non-2xx raises via raise_for_status().
    """
    api_resource = "https://api.elsevier.com/content/search/scopus"
    # Parameters
    payload = {'query':query,
               'count':count,
               'start':start,
               'sort': sort,
               'date': year}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params = payload,
                      timeout = timeout_secs)
            logging.debug(r.url)
            if r.status_code == 401:
                # expired/missing token: refresh and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
            break
        except requests.exceptions.Timeout:
            pass
    else:
        # for-else: every attempt timed out (or got a 401)
        raise requests.exceptions.Timeout('Timeout Error')

    r.raise_for_status()
    return r.json()
|
||||
|
||||
|
||||
def get_cited_by(eid, output_file):
    """Fetch every article citing *eid* and append the results to *output_file*."""
    query = 'refeid({})'.format(eid)
    return get_search_results(query, output_file,
                              results_per_call=200, citation_call=True)
|
||||
|
||||
|
||||
def get_abstract(eid, retry_limit = RETRY_COUNT,
                 timeout_secs = TIMEOUT_SECS):
    """Fetch the raw abstract JSON for *eid* as a UTF-8 string.

    Returns None when the API answers 404; retries timeouts up to
    *retry_limit* times; refreshes the auth token on 401.
    """
    api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
    # Parameters
    payload = {}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params = payload,
                      timeout = timeout_secs)
            if r.status_code == 401:
                # expired/missing token: refresh and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
            break
        except requests.exceptions.Timeout:
            pass
    else:
        # for-else: every attempt timed out
        raise requests.exceptions.Timeout('Timeout Error')
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.content.decode('utf-8')
|
||||
1
code/data_collection/scopus_api.py
Normal file
1
code/data_collection/scopus_api.py
Normal file
@@ -0,0 +1 @@
|
||||
# Placeholder Scopus API key -- the real key was scrubbed before this
# public release; substitute your own Elsevier API key here.
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
|
||||
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
@@ -0,0 +1,177 @@
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
import random
|
||||
|
||||
# fixed seed so the random tie-break in get_country() is reproducible
random.seed(2017)
|
||||
|
||||
def main():
    """Convert a JSON-lines abstract dump into a TSV of cleaned records."""
    parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            # Have to get the field names
            # (taken from the first record; all records yield the same keys)
            first_line = clean_abstract(json.loads(next(i)))
            fieldnames = first_line.keys()
            output = csv.DictWriter(o, fieldnames, delimiter='\t')
            output.writeheader()
            output.writerow(first_line)
            for line in i:
                output.writerow(clean_abstract(json.loads(line)))
|
||||
|
||||
|
||||
def clean_abstract(json_response):
    """Flatten one Scopus abstract-retrieval JSON record into a flat dict.

    Missing required pieces raise KeyError/TypeError; callers are
    expected to let those propagate.
    """
    result = json_response['abstracts-retrieval-response']
    head = result['item']['bibrecord']['head']
    try:
        attributes = {
            'modal_country': get_country(head),
            'abstract' : get_abstract(result),
            'title' : get_title(result),
            'source_title': get_source_title(head),
            'language': result['language']['@xml:lang'],
            'first_ASJC_subject_area': get_subject(result, '$'),
            'first_ASJC_classification': get_subject(result, '@code'),
            'first_CPX_class': get_CPX_class(head, 'classification-description'),
            'date': to_date(result['coredata']['prism:coverDate']),
            'aggregation_type' : if_exists('prism:aggregationType',result['coredata'],else_val='NA'),
            'eid' : result['coredata']['eid'],
            'cited_by_count': result['coredata']['citedby-count'],
            'num_citations': get_citation_count(result)
        }
    # both handlers just re-raise; kept so a breakpoint/print can be
    # re-enabled here when a malformed record shows up
    except KeyError:
        raise
    except TypeError:
        # print(result)
        raise
    return attributes
|
||||
|
||||
def get_citation_count(result):
    """Reference count from the record's bibliography, or None when the tail is null."""
    try:
        return result['item']['bibrecord']['tail']['bibliography']['@refcount']
    except TypeError:
        # the 'tail' element can be JSON null, which makes the subscript blow up
        return None
|
||||
|
||||
def get_title(result):
    """Return the article title; a KeyError propagates when it is missing."""
    # was wrapped in a no-op `try: ... except KeyError: raise`
    return result['coredata']['dc:title']
|
||||
|
||||
|
||||
def get_source_title(head):
    """Return the publication (journal) title; a KeyError propagates when missing."""
    # was wrapped in a no-op `try: ... except KeyError: raise`
    return head['source']['sourcetitle']
|
||||
|
||||
def get_abstract(result):
    """Return the abstract text with newlines flattened, or None when absent."""
    try:
        text = result['coredata']['dc:description']
    except KeyError:
        return None
    return text.replace('\n', ' ')
|
||||
|
||||
def get_auth_names(head):
    """Return 'Given Surname' strings for every author in *head*.

    Logs the record and returns an empty list when it has no author-group
    (the original fell through to a NameError on `auth_info` here).
    """
    try:
        auth_info = [x['author'] for x in make_list(head['author-group'])]
    except KeyError:
        print(head)
        return []  # was: execution continued into the loop with auth_info undefined
    auth_names = []
    for auth_group in auth_info:
        for auth in make_list(auth_group):
            auth_names.append('{} {}'.format(
                auth['preferred-name']['ce:given-name'],
                auth['preferred-name']['ce:surname']))
    return auth_names
|
||||
|
||||
def get_country(head):
    """Most common author-affiliation country, with random tie-breaking.

    Returns None (implicitly) when no affiliation info is available.
    """
    all_countries = get_aff_info(head, 'country')
    if all_countries:
        # Find the mode. If there's more than one, choose randomly
        # (a dead leftover assignment `modes = Counter` was removed here)
        distinct = set(all_countries)
        max_count = max(all_countries.count(x) for x in distinct)
        modes = [x for x in distinct if all_countries.count(x) == max_count]
        return random.choice(modes)
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author (repeated once per co-author).

    Returns None when the record has no author-group at all; author groups
    lacking the requested key contribute empty strings instead.
    """
    aff_info = []
    try:
        authors = make_list(head['author-group'])
    except KeyError:
        return None
    for x in authors:
        try:
            num_auth = len(make_list(x['author']))
        except KeyError:
            # Apparently there are things called "collaborations", which don't have affiliation info.
            # I'm just skipping them
            continue
        except TypeError:
            # And apparently "None" appears in the author list for no reason. :)
            continue
        try:
            curr_inst = x['affiliation'][affiliation_key]
            # Add one instance for each author from this institution
            aff_info += [curr_inst] * num_auth
        except KeyError:
            # If there isn't affiliation info for these authors, return empty str
            aff_info += [''] * num_auth
    return aff_info
|
||||
|
||||
def get_keywords(head):
    """Author keywords for the record, or None when there are none."""
    cite_info = head['citation-info']
    try:
        keywords = [x for x in
                    make_list(cite_info['author-keywords']['author-keyword'])]
        # When there's only one keyword, it's a string. Otherwise, we will
        # have a list of dictionaries
        # NOTE(review): the single-item branch returns the raw one-element
        # list rather than extracting '$' -- confirm this is intended.
        if len(keywords) == 1:
            return keywords
        else:
            return [x['$'] for x in keywords]
    except KeyError:
        return None
|
||||
|
||||
def get_subject(result, key):
    """First ASJC subject-area value under *key* ('$' for name, '@code' for code)."""
    try:
        return [x[key] for x in make_list(result['subject-areas']['subject-area'])][0]
    except KeyError:
        # dump the offending record before re-raising
        print(result)
        raise
|
||||
|
||||
def get_CPX_class(head, class_key):
    """First Compendex (CPXCLASS) classification value under *class_key*, or None.

    Returns None (implicitly) when no CPXCLASS group exists at all.
    """
    try:
        for x in head['enhancement']['classificationgroup']['classifications']:
            if x['@type'] == 'CPXCLASS':
                try:
                    return [y[class_key] for y in make_list(x['classification'])][0]
                except (KeyError, TypeError):
                    return None
    except KeyError:
        # dump the classification group before re-raising
        print(head['enhancement']['classificationgroup'])
        raise
|
||||
|
||||
def to_date(date_string):
    """Parse an ISO 'YYYY-MM-DD' string into a datetime."""
    return datetime.strptime(date_string, '%Y-%m-%d')
|
||||
|
||||
|
||||
def if_exists(key, dictionary, else_val = None):
    """Return dictionary[key], or *else_val* when the key is missing."""
    # dict.get does exactly this lookup-with-default in one call
    return dictionary.get(key, else_val)
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a lone value in a list; pass lists through unchanged."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]


if __name__ == '__main__':
    main()
|
||||
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Flatten cited-by JSON lines into a (to, from, date) edgelist TSV."""
    parser = argparse.ArgumentParser(description='Make a citation network from the cited_by json')
    parser.add_argument('-i', help='Cited_by file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter = '\t')
            output.writerow(['to','from', 'date'])
            for line in i:
                line = json.loads(line)
                # edge points from the citing paper ('eid') to the cited parent
                output.writerow([line['parent_eid'], line['eid'], line['prism:coverDate']])


if __name__ == '__main__':
    main()
|
||||
|
||||
29
code/data_processing/02_filter_edgelist.py
Normal file
29
code/data_processing/02_filter_edgelist.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Filter the full edgelist down to citations coming from cited papers.

    Keeps only rows whose 'from' paper also appears in the 'to' column,
    i.e. citations made by papers that are themselves cited in our set.
    """
    parser = argparse.ArgumentParser(description='Take the edgelist, and reduce it to just the papers which are in our search')
    parser.add_argument('-i', help='Full edgelist file')
    parser.add_argument('-o', help='Edgelist output file')
    args = parser.parse_args()

    with open(args.i, 'r') as in_file:
        i = csv.reader(in_file, delimiter= '\t')
        next(i) # Discard header
        # Get the list of nodes to keep (every eid that is cited)
        nodes = set(x[0] for x in i)
        in_file.seek(0) # Start over at the beginning
        next(i)  # was missing: the header row was re-read as data after the seek
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter = '\t')
            output.writerow(['to','from', 'date'])
            for line in i:
                # keep the edge when its source paper is in the kept set
                if line[1] in nodes:
                    output.writerow(line)


if __name__ == '__main__':
    main()
|
||||
|
||||
62
code/data_processing/03_make_paper_aff_table.py
Normal file
62
code/data_processing/03_make_paper_aff_table.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Write a paper-to-affiliation TSV from the JSON-lines abstract dump."""
    parser = argparse.ArgumentParser(description='Generate paper to affiliation mapping file from abstracts file')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter='\t')
            output.writerow(['paper_eid','affiliation_id',
                             'organization','country'])
            # one output row per author-group affiliation per paper
            for line in i:
                entries = get_entries(line)
                for entry in entries:
                    output.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Turn one abstract JSON line into [eid, afid, organization, country] rows.

    Returns an empty list when the record carries no country information.
    """
    record = json.loads(l)['abstracts-retrieval-response']
    head = record['item']['bibrecord']['head']
    eid = record['coredata']['eid']

    countries = get_aff_info(head, 'country')
    if not countries:
        return []
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')
    return [[eid, affiliation_ids[idx], org_names[idx], countries[idx]]
            for idx in range(len(countries))]
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author group in *head*.

    Returns None when the record has no author-group at all; groups
    lacking the requested key contribute empty strings.
    """
    aff_info = []
    try:
        affiliations = make_list(head['author-group'])
    except KeyError:
        return None
    for x in affiliations:
        # "None" can appear in the author-group list; skip it
        if x is None:
            continue
        try:
            curr_inst = x['affiliation'][affiliation_key]
            # May return a string or a list. If it's a list, then
            # return the final value of that list (This is the base organization)
            # (note: the inner comprehension's x shadows the loop variable,
            # which is safe only because x is not used again afterwards)
            if isinstance(curr_inst, list):
                curr_inst = [x['$'] for x in curr_inst][-1]
            aff_info.append(curr_inst)
        except KeyError:
            # If there isn't affiliation info for these authors, return empty str
            aff_info.append('')
    return aff_info
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a lone value in a list; pass lists through unchanged."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]


if __name__ == '__main__':
    main()
|
||||
50
code/data_processing/04_make_paper_subject_table.py
Normal file
50
code/data_processing/04_make_paper_subject_table.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Read one JSON abstract record per line and write a TSV mapping each
    paper EID to its subject names and codes."""

    parser = argparse.ArgumentParser(description='Generate paper to subject mapping file from abstracts file')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    # Stream line-by-line so the full abstracts file never sits in memory.
    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter='\t')
            output.writerow(['paper_eid','subject',
                             'subject_code'])
            for line in i:
                # One input line = one paper; it may yield several subjects.
                entries = get_entries(line)
                for entry in entries:
                    output.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Parse one JSON abstract record and return its subject rows.

    Each row is [paper_eid, subject_name, subject_code].
    (Removed an unreachable `return []` that followed the unconditional
    return in the original.)
    """
    json_response = json.loads(l)
    full = json_response['abstracts-retrieval-response']
    eid = full['coredata']['eid']
    subjects = get_subjects(full)
    # Prepend the eid, and return the subjects
    return [[eid, s[0], s[1]] for s in subjects]
|
||||
|
||||
|
||||
def get_subjects(abstract_response):
    """Return [name, code] pairs for every subject area on the record.

    Raises KeyError (after logging the offending record) when the
    response has no subject-area section.
    """
    try:
        subject_info = abstract_response['subject-areas']['subject-area']
    except KeyError:
        # BUG FIX: the original printed `result`, a name not yet bound at
        # this point, so the KeyError was masked by a NameError. Log the
        # response we failed on instead, then re-raise as before.
        print(abstract_response)
        raise
    # A single subject arrives as a dict rather than a list; normalize.
    if not isinstance(subject_info, list):
        subject_info = [subject_info]
    # Get the subject name and code for each entry.
    return [[s['$'], s['@code']] for s in subject_info]
|
||||
|
||||
|
||||
def make_list(list_or_dict):
    """Normalize a decoded-JSON field: non-lists get wrapped in a list."""
    return [list_or_dict] if not isinstance(list_or_dict, list) else list_or_dict
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
17
code/data_processing/05_save_descriptives.R
Normal file
17
code/data_processing/05_save_descriptives.R
Normal file
@@ -0,0 +1,17 @@
|
||||
# Load the processed abstracts, derive per-paper variables (year, word
# count), join on the paper/subject table, and save everything into one
# RData file for use in the paper.
df = read.csv('processed_data/abstracts.tsv',sep='\t', strip.white=TRUE)
df['date'] = as.Date(df$date)
# Empty strings in modal_country mean "unknown" -- recode as NA.
df$modal_country[df['modal_country'] == ''] <- NA
# NOTE(review): format() is applied to the one-column data.frame df['date']
# rather than the date vector df$date -- confirm this yields the intended
# year strings.
df['year'] = format(df['date'],'%Y')

# Keep only papers that actually have an abstract.
abstracts <- df[df['abstract'] != '',c('eid','abstract')]
# Creates a vector of word counts, based on counting all of the groups of alphanumeric characters
word_count <- apply(abstracts, 1, function(x) sapply(gregexpr("[[:alnum:]]+", x['abstract']), function(x) sum(x > 0)))

s = read.csv('processed_data/paper_subject_table.tsv', sep='\t')
# One row per (paper, subject) pair after the merge.
full <- merge(df,s, by.x = 'eid', by.y = 'paper_eid')

# zero these out before we save them so we don't save all of the abstracts.
full['abstract'] <- NULL
df['abstract'] <- NULL

save(df, abstracts, s, full, word_count, file="paper/data/orig_data_sets.RData")
|
||||
26
code/data_processing/make_network.py
Normal file
26
code/data_processing/make_network.py
Normal file
@@ -0,0 +1,26 @@
|
||||
'''Takes a CSV of retrieved articles, and creates an igraph
|
||||
network from them (not even close to done)'''
|
||||
|
||||
class CitationNetwork(igraph.Graph):
    """Directed citation graph assembled incrementally from retrieved
    articles. Per the module docstring this class is unfinished; the
    notes below flag the remaining inconsistencies without changing the
    public interface."""

    def __init__(self, network_type):
        super().__init__(directed=True)
        self.temp_edges = []
        self.temp_vertices = []
        # BUG FIX: add_citations appends to this list, but it was never
        # initialized anywhere.
        self.retrieved_eids = []
        self.network_type = network_type

    def add_vertices(self, to_node, from_nodes):
        # Queue (from, to) pairs for later materialization.
        # NOTE(review): this overrides igraph.Graph.add_vertices with an
        # incompatible signature, and make_network() below calls
        # self.add_vertices(nodes) with a single argument, which will hit
        # this override and fail. Renaming this method is the likely
        # eventual fix; left as-is to preserve the external interface.
        self.temp_vertices += [[from_node, to_node] for from_node in from_nodes]

    def make_network(self):
        # Get the unique set of nodes, and add them.
        # NOTE(review): temp_vertices holds [from, to] lists, so v['eid']
        # cannot work as written -- this method is part of the unfinished
        # surface noted above.
        nodes = set([v for v in self.temp_vertices if v['eid'] not in self.vs['name']])
        nodes = sorted(nodes)
        self.add_vertices(nodes)
        self.add_edges(self.temp_edges)
        self.es['weight'] = 1

    def collapse_weights(self):
        # Merge parallel edges, summing their weights into one edge.
        self.simplify(combine_edges={"weight": "sum"})

    def add_citations(self, eid, citations):
        # BUG FIX: the original signature omitted self, so the body's
        # reference to self raised at call time.
        self.retrieved_eids.append(eid)
|
||||
89
code/prediction/00_ngram_extraction.py
Normal file
89
code/prediction/00_ngram_extraction.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from time import time
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
import csv
|
||||
import argparse
|
||||
|
||||
n_features = 100000 # Gets the top n_features terms
|
||||
n_samples = None # Enter an integer here for testing, so it doesn't take so long
|
||||
|
||||
def main():
    """Load the abstracts, count n-grams, and write a long-format CSV of
    (document_id, term, frequency) rows."""

    parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
                        default=3)

    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    # n_samples is a module-level testing knob; None means "use everything".
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header (truncates any previous output file).
    write_header(args.o)

    # n_features (module-level) caps the vocabulary at the most frequent terms.
    bags_o_words = get_counts(data_samples, n_features, args.n)
    # write_output appends, so it must run after write_header.
    write_output(doc_ids, bags_o_words, args.o)
|
||||
|
||||
def get_counts(abstracts, n_features, ngram_max):
    """Count 1..ngram_max n-grams over the abstracts.

    Returns one {term: count} dict per document, restricted to the
    n_features most frequent terms (English stop words removed, terms in
    >95% or <2 documents dropped).
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                 max_features=n_features,
                                 stop_words='english',
                                 ngram_range=(1, ngram_max))
    start = time()
    counts = vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - start))

    vocabulary = vectorizer.get_feature_names()
    return to_bags_o_words(vocabulary, counts.toarray())
|
||||
|
||||
|
||||
def write_header(out_file):
    """Create/truncate out_file and write the CSV header row."""
    header = ['document_id', 'term', 'frequency']
    with open(out_file, 'w') as handle:
        csv.writer(handle).writerow(header)
|
||||
|
||||
def to_bags_o_words(terms, freqs):
    """Convert a vocabulary list plus a dense document-term count matrix
    into a list of per-document {term: count} dicts.

    Zero counts are omitted, so an all-zero row yields an empty dict.
    """
    return [
        {terms[idx]: count for idx, count in enumerate(row) if count > 0}
        for row in freqs
    ]
|
||||
|
||||
def write_output(ids, bags_o_words, out_file):
    """Append one (document_id, term, frequency) CSV row per term per
    document. ids and bags_o_words are parallel lists."""
    with open(out_file, 'a') as handle:
        writer = csv.writer(handle)
        for doc_id, bag in zip(ids, bags_o_words):
            # For each term and count, output a row, together with the document id
            for term, count in bag.items():
                writer.writerow([doc_id, term, count])
|
||||
|
||||
def get_ids_and_abstracts(fn, length_limit):
    """Read the abstracts TSV and return (ids, abstracts) as two aligned
    lists.

    Rows missing either column are printed and skipped. If length_limit
    is truthy, at most length_limit rows are returned (testing knob).
    """
    abstracts = []
    ids = []
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        for r in in_csv:
            # BUG FIX: the limit was checked after appending and compared
            # with a post-incremented counter, so length_limit=k returned
            # k+1 rows; check before processing instead.
            if length_limit and len(ids) >= length_limit:
                break
            # BUG FIX: the original appended the abstract before looking
            # up the eid, so a row with an abstract but no eid left the
            # two lists out of sync. Read both fields before appending.
            try:
                abstract = r['abstract']
                eid = r['eid']
            except KeyError:
                print(r)
                continue
            abstracts.append(abstract)
            ids.append(eid)
    return ids, abstracts
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
89
code/prediction/01-build_control_variables.R
Normal file
89
code/prediction/01-build_control_variables.R
Normal file
@@ -0,0 +1,89 @@
|
||||
source("code/prediction/utils.R")
|
||||
|
||||
# use this to store things for use in the paper
|
||||
pred.descrip <- NULL
|
||||
|
||||
abstracts <- read.delim("processed_data/abstracts.tsv", header=TRUE,
|
||||
stringsAsFactors=FALSE, sep="\t")
|
||||
|
||||
abstracts <- subset(abstracts, select = -abstract)
|
||||
|
||||
abstracts <- abstracts[abstracts$aggregation_type != "Trade Journal" &
|
||||
is.na(abstracts$aggregation_type) == FALSE, ]
|
||||
|
||||
names(abstracts)[names(abstracts) == 'num_citations'] <- 'works_cited'
|
||||
abstracts$works_cited[is.na(abstracts$works_cited) == TRUE] <- 0
|
||||
|
||||
# affiliations
|
||||
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
|
||||
header=TRUE, stringsAsFactors=FALSE,
|
||||
sep="\t")
|
||||
|
||||
# eliminate missing values
|
||||
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
|
||||
affiliations$organization != "", ]
|
||||
|
||||
|
||||
# Map an affiliation id to the modal (most frequent) organization name
# recorded for that id in aff.df.
# BUG FIX: the original ignored its aff.df argument and always read the
# global `affiliations` data frame; use the argument instead (the default
# value keeps existing call sites unchanged).
remap.affiliations <- function(aff.id,
                               aff.df = affiliations){
    org.counts <- sort(table(aff.df$organization[
        aff.df$affiliation_id == aff.id]))
    # tail(..., 1) of the ascending sort is the most frequent name.
    org.modal <- names(tail(org.counts, 1))
    return(org.modal)
}
|
||||
|
||||
affiliations$organization <- sapply(affiliations$affiliation_id, remap.affiliations)
|
||||
|
||||
affiliations <- subset(affiliations, select = c(paper_eid,
|
||||
organization))
|
||||
names(affiliations) <- c("eid", "affiliation")
|
||||
|
||||
# need to remove repeat affiliations
|
||||
affiliations <- affiliations[duplicated(affiliations$eid) == FALSE,]
|
||||
|
||||
|
||||
######################################
|
||||
d <- abstracts[, c("eid", "language", "modal_country",
|
||||
"source_title", "works_cited")]
|
||||
|
||||
# dichotomous dependent variable
|
||||
d$cited <- abstracts$cited_by_count > 0
|
||||
|
||||
|
||||
# store this here for use in the paper before we run any restrictions:
|
||||
pred.descrip$cited <- d$cited
|
||||
pred.descrip$cites <- abstracts$cited_by_count
|
||||
|
||||
|
||||
# We want these to be categorical variables
|
||||
d$modal_country <- factor(d$modal_country)
|
||||
d$language <- factor(d$language)
|
||||
d$subject <- factor(abstracts$first_ASJC_subject_area)
|
||||
d$source_title <- factor(d$source_title)
|
||||
d$month <- factor(strftime(abstracts$date, format= "%m"))
|
||||
# except for pub year - keep that continuous
|
||||
d$year <- as.numeric(strftime(abstracts$date, format="%Y"))
|
||||
|
||||
# bring in org affiliations
|
||||
d <- merge(d, affiliations, by="eid") # note that this drops papers
|
||||
# w/out org info
|
||||
|
||||
d$affiliation <- factor(d$affiliation)
|
||||
|
||||
##### Restrictions:
|
||||
|
||||
### do this explicitly so that changes are easy:
|
||||
d <- restrict(d, d$affiliation, 1)
|
||||
d <- restrict(d, d$subject, 1)
|
||||
d <- restrict(d, d$source_title, 1)
|
||||
d <- restrict(d, d$language, 1)
|
||||
d <- restrict(d, d$modal_country, 1)
|
||||
|
||||
# n.authors
|
||||
# per author prior citations
|
||||
|
||||
pred.descrip$covars <- d
|
||||
save(pred.descrip, file = "paper/data/prediction_descriptives.RData")
|
||||
|
||||
|
||||
rm(d, abstracts, affiliations)
|
||||
|
||||
56
code/prediction/02-build_textual_features.R
Normal file
56
code/prediction/02-build_textual_features.R
Normal file
@@ -0,0 +1,56 @@
|
||||
# Build a wide, dichotomous paper-by-ngram matrix from the long-format
# ngram count table, keeping only terms that appear across many subject
# areas, and save it for the prediction analysis.
library(data.table)

# import ngram data
# note that the file is not pushed to repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?)  Column 3 (the frequency) is dropped on read.
ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
                     header=TRUE, stringsAsFactors=FALSE)[,-3]
names(ngrams)[1] <- "eid"

subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
                       stringsAsFactors=FALSE, sep="\t")[,c("eid",
                       "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"

# takes a couple of minutes:
ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)

# only use ngrams that occur accross all (many?) subject areas
# Count how many distinct subject areas each term appears in.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x)))

# summary(subject.by.ngram)
#
# library(txtplot)
# txtdensity(log(subject.by.ngram))

# Note:
# The median number of subject areas per term is five. We cut off at
# terms that occur across more than 30 subject areas (the filter below
# is strictly greater-than).

top.ngrams <- ngrams[ngrams$term %in%
                     names(subject.by.ngram[subject.by.ngram >
                                            30]),c("eid", "term")]

# Free the large intermediates before the expensive reshape.
rm(ngrams, subject.by.ngram, subjects)

# convert to a wide format matrix of dichotomous variables
library(reshape2)
library(data.table)

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

# Marker column so dcast has something to aggregate.
top.ngrams[,vv:= TRUE]

# took more than 20 minutes on hyak
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
                          value.var = "vv")

rm(top.ngrams)

save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
#load("processed_data/top.ngram.matrix.RData")
|
||||
221
code/prediction/03-prediction_analysis.R
Normal file
221
code/prediction/03-prediction_analysis.R
Normal file
@@ -0,0 +1,221 @@
|
||||
library(data.table)
|
||||
library(Matrix)
|
||||
library(glmnet)
|
||||
library(xtable)
|
||||
library(methods)
|
||||
|
||||
predict.list <- NULL
|
||||
|
||||
if(!exists("top.ngram.matrix")){
|
||||
load("processed_data/top.ngram.matrix.RData")
|
||||
}
|
||||
|
||||
if(!exists("pred.descrip")){
|
||||
load("paper/data/prediction_descriptives.RData")
|
||||
covars <- pred.descrip$covars
|
||||
}
|
||||
|
||||
top.ngram.matrix <- data.table(top.ngram.matrix)
|
||||
setkey(top.ngram.matrix, eid)
|
||||
covars <- data.table(pred.descrip$covars)
|
||||
setkey(covars,eid)
|
||||
|
||||
# restrict to the overlap of the two datasets
|
||||
covars <- covars[covars$eid %in% top.ngram.matrix$eid,]
|
||||
|
||||
top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in%
|
||||
covars$eid,]
|
||||
|
||||
# rename the cited column in case it doesn't appear
|
||||
names(covars)[names(covars) == 'cited'] <- 'cited.x'
|
||||
|
||||
# then merge also to facilitate some manipulations below
|
||||
d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE)
|
||||
|
||||
# Note that this duplicates some column names so X gets appended in a
|
||||
# few cases.
|
||||
|
||||
# construct model matrices
|
||||
x.controls <- sparse.model.matrix(cited.x ~ language.x +
|
||||
modal_country + month.x,
|
||||
data=d)[,-1]
|
||||
|
||||
x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1]
|
||||
x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1]
|
||||
x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1]
|
||||
|
||||
x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid))
|
||||
x.ngrams <- as(x.ngrams, "sparseMatrix")
|
||||
|
||||
X <- cBind(x.controls, covars$year.x, covars$works.cited)
|
||||
X.aff <- cBind(X, x.aff)
|
||||
X.subj <- cBind(X.aff, x.subj)
|
||||
X.venue <- cBind(X.subj, x.venue)
|
||||
X.terms <- cBind(X.venue, x.ngrams)
|
||||
|
||||
Y <- covars$cited
|
||||
|
||||
### Hold-back sample for testing model performance later on:
|
||||
set.seed(20160719)
|
||||
holdback.index <- sample(nrow(X), round(nrow(X)*.1))
|
||||
|
||||
X.hold <- X[holdback.index,]
|
||||
X.hold.aff <- X.aff[holdback.index,]
|
||||
X.hold.subj <- X.subj[holdback.index,]
|
||||
X.hold.venue <- X.venue[holdback.index,]
|
||||
X.hold.terms <- X.terms[holdback.index,]
|
||||
Y.hold <- Y[holdback.index]
|
||||
|
||||
X.test <- X[-holdback.index,]
|
||||
X.test.aff <- X.aff[-holdback.index,]
|
||||
X.test.subj <- X.subj[-holdback.index,]
|
||||
X.test.venue <- X.venue[-holdback.index,]
|
||||
X.test.terms <- X.terms[-holdback.index,]
|
||||
Y.test <- Y[-holdback.index]
|
||||
|
||||
############### Models and prediction
|
||||
|
||||
set.seed(20160719)
|
||||
|
||||
m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
con.pred = predict(m.con, type="class", s="lambda.min",
|
||||
newx=X.hold)
|
||||
|
||||
m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
aff.pred = predict(m.aff, type="class", s="lambda.min",
|
||||
newx=X.hold.aff)
|
||||
|
||||
m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
subj.pred = predict(m.subj, type="class", s="lambda.min",
|
||||
newx=X.hold.subj)
|
||||
|
||||
m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
venue.pred = predict(m.venue, type="class", s="lambda.min",
|
||||
newx=X.hold.venue)
|
||||
|
||||
m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
terms.pred = predict(m.terms, type="class", s="lambda.min",
|
||||
newx=X.hold.terms)
|
||||
|
||||
##########
|
||||
# Compare test set predictions against held-back sample:
|
||||
|
||||
pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred,
|
||||
venue.pred, terms.pred))
|
||||
names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue",
|
||||
"+ Terms")
|
||||
|
||||
m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms)
|
||||
|
||||
# collect:
|
||||
# df
|
||||
# percent.deviance
|
||||
# nonzero coefficients
|
||||
# prediction error
|
||||
|
||||
# Summarize a fitted cv.glmnet model: number of nonzero coefficients
# (df), percent of deviance explained, and cross-validation error, the
# latter two as percentages.
# NOTE(review): all three values are read from the *last* entry of the
# lambda path (tail(..., 1)), not at lambda.min -- confirm that is the
# intended summary, since predictions elsewhere use s="lambda.min".
gen.m.summ.info <- function(model){
    df <- round(tail(model$glmnet.fit$df, 1),0)
    percent.dev <- round(tail(model$glmnet.fit$dev.ratio, 1),2)*100
    cv.error <- round(tail(model$cvm,1),2)*100
#    null.dev <- round(tail(model$glmnet.fit$nulldev),0)
    out <- c(df, percent.dev, cv.error)
    return(out)
}
|
||||
|
||||
# Misclassification rate (as a percentage): the sum of the off-diagonal
# cells of the prediction-vs-truth confusion table.
# NOTE(review): props[1,2] and props[2,1] assume both classes appear in
# both pred and test (a full 2x2 table); a degenerate one-row or
# one-column table would error here.
gen.class.err <- function(pred, test){
    props <- prop.table(table(pred, test))
    err.sum <- round(sum(props[1,2], props[2,1]),2)*100
    return(err.sum)
}
|
||||
|
||||
|
||||
results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list,
|
||||
gen.m.summ.info)),
|
||||
byrow=T, nrow=5)))
|
||||
|
||||
results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x,
|
||||
Y.hold))
|
||||
|
||||
results.tab <- data.frame(lapply(results.tab, as.character))
|
||||
|
||||
|
||||
|
||||
names(results.tab) <- c("Model", "N features", "Deviance (%)",
|
||||
"CV error (%)", "Hold-back error (%)")
|
||||
|
||||
|
||||
print(xtable(results.tab,
|
||||
caption=
|
||||
"Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.",
|
||||
label='tab:predict_models', align='llrrrr'),
|
||||
include.rownames=FALSE)
|
||||
|
||||
# Store the results:
|
||||
predict.list$results.tab <- results.tab
|
||||
|
||||
|
||||
|
||||
|
||||
############# Generate most salient coefficients
|
||||
nz.coefs <- data.frame( coef =
|
||||
colnames(X.test.terms)[which(
|
||||
coef(m.terms, s="lambda.min")
|
||||
!= 0)],
|
||||
type = "term",
|
||||
beta =
|
||||
coef(m.terms,
|
||||
s="lambda.min")[which(coef(m.terms,
|
||||
s="lambda.min")
|
||||
!= 0)])
|
||||
|
||||
nz.coefs$coef <- as.character(nz.coefs$coef)
|
||||
nz.coefs$type <- as.character(nz.coefs$type)
|
||||
nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),]
|
||||
|
||||
# comparison:
|
||||
|
||||
#nz.coefs$type <- "terms"
|
||||
nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA
|
||||
nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue"
|
||||
nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject"
|
||||
nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation"
|
||||
nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month"
|
||||
nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country"
|
||||
nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language"
|
||||
nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year"
|
||||
|
||||
|
||||
# cleanup
|
||||
nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef)
|
||||
nz.coefs$beta <- round(nz.coefs$beta, 3)
|
||||
names(nz.coefs) <- c("Feature", "Type", "Coefficient")
|
||||
|
||||
predict.list$nz.coefs <- nz.coefs
|
||||
|
||||
# table for all
|
||||
round(prop.table(table(nz.coefs$Type))*100, 2)
|
||||
|
||||
# for top subsets
|
||||
round(prop.table(table(nz.coefs$Type[1:700]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:200]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:100]))*100, 2)
|
||||
|
||||
print(xtable(
|
||||
as.matrix(head(nz.coefs, 10)),
|
||||
label='tab:nzcoefs',
|
||||
caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.',
|
||||
align='lllr'
|
||||
), include.rownames=FALSE)
|
||||
|
||||
|
||||
# output
|
||||
save(predict.list, file="paper/data/prediction.RData")
|
||||
|
||||
|
||||
13
code/prediction/utils.R
Normal file
13
code/prediction/utils.R
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Use this to check for underpopulated cells
|
||||
# Use this to check for underpopulated cells
# Returns, for each level of the categorical variable c.var, the number
# of unique papers (eids) in df carrying that level.
gen.counts <- function(df, c.var){
    tapply(df[,"eid"], c.var, function(x) length(unique(x)))
}
|
||||
|
||||
# use this to remove underpopulated cells
|
||||
# use this to remove underpopulated cells
# Keeps only rows of df whose value of c.var occurs in strictly more
# than c.min unique papers (per gen.counts).
restrict <- function(df, c.var, c.min){
    var.counts <- gen.counts(df, c.var)
    out.df <- df[c.var %in% names(var.counts[var.counts >
                                             c.min]),]
    return(out.df)
}
|
||||
126
code/topic_modeling/00_topics_extraction.py
Normal file
126
code/topic_modeling/00_topics_extraction.py
Normal file
@@ -0,0 +1,126 @@
|
||||
|
||||
from time import time
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||
from sklearn.decomposition import NMF, LatentDirichletAllocation
|
||||
import sys
|
||||
import csv
|
||||
import pandas as pd
|
||||
import argparse
|
||||
|
||||
"""
|
||||
This code was inspired/copied from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.
|
||||
|
||||
It takes in an abstract file, and creates two outputs: The abstracts together with their topic distribution and a set of topics and the top words associated with each.
|
||||
"""
|
||||
|
||||
n_samples = None # Enter an integer here for testing.
|
||||
n_features = 20000
|
||||
n_topics = 12
|
||||
|
||||
def main():
    """Fit an LDA topic model to the abstracts and write two CSVs: each
    abstract with its topic probability distribution, and the top words
    for each topic (both with columns ordered by topic prevalence)."""

    parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    # n_samples is a module-level testing knob; a None slice is a no-op.
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95, # Terms that show up in > max_df of documents are ignored
                                    min_df=2, # Terms that show up in < min_df of documents are ignored
                                    max_features=n_features, # Only use the top max_features
                                    stop_words='english',
                                    ngram_range=(1,2))
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2017,
                                    n_jobs=2)
    t0 = time()
    # BUG FIX: the original called lda.fit(tf) and then lda.fit_transform(tf),
    # fitting the expensive model twice; one fit_transform yields the same
    # fitted model (used below via lda.components_) plus the doc-topic matrix.
    doc_topics = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    # Change the values into a probability distribution for each abstract
    topic_dist = [[topic/sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in doc_topics]

    # Make the topic distribution into a dataframe
    td = pd.DataFrame(topic_dist)
    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Get the top words by topic
    topic_words = get_top_words(lda, tf_feature_names, 20)
    # Sort columns by how often each topic is used, most-used first.
    # (reindex(..., axis=1) replaces reindex_axis, which was removed in
    # pandas 0.25; behavior is identical.)
    order = sorted(topic_words.columns, key=lambda x: td[x].sum(), reverse=True)
    topic_words = topic_words.reindex(order, axis=1)

    # Rearrange the columns by how often each topic is used
    td = td.reindex(sorted(td.columns, key=lambda x: td[x].sum(), reverse=True),
                    axis=1)

    topic_words.to_csv(args.t, index=False)

    # Join each abstract's metadata with its topic distribution and save.
    df = pd.DataFrame(doc_data)
    df = df.join(td)

    df.to_csv(args.o, index=False)
|
||||
|
||||
def get_abstracts(fn):
    """Read the abstracts TSV and return (abstracts, doc_data).

    abstracts holds the abstract text, doc_data the full row dicts, for
    every row whose abstract looks real (more than 5 characters). Rows
    with no 'abstract' column are printed and skipped.
    """
    abstracts = []
    doc_data = []
    with open(fn, 'r') as handle:
        for row in csv.DictReader(handle, delimiter='\t'):
            try:
                text = row['abstract']
            except KeyError:
                print(row)
                continue
            # If this isn't really an abstract, then don't add it.
            if len(text) > 5:
                abstracts.append(row['abstract'])
                doc_data.append(row)
    return abstracts, doc_data
|
||||
|
||||
def get_top_words(model, feature_names, n_top_words):
    """Takes the model, the words used, and the number of words requested.
    Returns a dataframe with one column per topic, holding that topic's
    n_top_words highest-weight terms (multi-word terms quoted)."""
    top_words = pd.DataFrame()
    # For each topic in the fitted model...
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so slice the last n_top_words in reverse
        # order to get the highest-weight terms first.
        # (Renamed the comprehension variable: the original reused `i`
        # for both the topic index and the term index, which read as
        # shadowing.)
        top_words[topic_idx] = [add_quotes(feature_names[term_idx])
                                for term_idx
                                in topic.argsort()[:-n_top_words - 1:-1]]
    return top_words
|
||||
|
||||
def add_quotes(s):
    """Wrap multi-word phrases in double quotes; single words pass through."""
    return '"{}"'.format(s) if " " in s else s
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
103
code/topic_modeling/01_make_paper_files.py
Normal file
103
code/topic_modeling/01_make_paper_files.py
Normal file
@@ -0,0 +1,103 @@
|
||||
'''Creates the figures and tables for LaTeX'''
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import datetime
|
||||
import argparse
|
||||
import os
|
||||
|
||||
# Human-readable names for the 12 LDA topics, in model order. These become
# column headers in the paper's LaTeX tables, so spelling matters.
topic_names = [
    'Media Use',
    'Social Network Analysis',
    # BUG FIX: was misspelled 'Consumer Analsyis'; the typo would have
    # appeared verbatim in the published tables.
    'Consumer Analysis',
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]
|
||||
|
||||
def main():
    """Build the LaTeX topic-word tables and save per-year topic summary
    statistics (sums, means, citation-weighted sums) to an RData file for
    the paper."""

    parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
                        default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
                        default='paper/tables/')
    parser.add_argument('-o', help = 'RData output file location',
                        default = 'paper/data/topic_model_data.RData')

    args = parser.parse_args()

    # Make the top_words tables
    tw = pd.read_csv(args.w)
    # Add names -- assumes the module-level topic_names list matches the
    # column count and order of the top-words file.
    tw.columns = topic_names
    # Save as 2 different tables, because they are too long
    if not os.path.exists(args.t):
        os.makedirs(args.t)
    tw.to_latex(args.t + 'topic_words1.tex',index=False, columns=tw.columns[:6])
    tw.to_latex(args.t + 'topic_words2.tex',index=False, columns=tw.columns[6:])

    # Load the abstracts and topics data
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)
    # Change to datetime
    df.date = pd.to_datetime(df.date)

    # Remove papers from 2016 since we don't have the entire year, so graphs are misleading
    df = df[df.date <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')
    # Rename the last columns as the topic names
    df.columns = list(df.columns[:-n_topics]) + topic_names
    # Group by year, and get only the LDA columns
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
    # Get summary statistics for each topic
    # Total amount published in each topic by year
    topic_sums = topics_by_year.sum()
    # Mean amount published in each topic
    topic_means = topics_by_year.mean()
    # Now, we weight the contributions by how much a paper has been cited.
    # Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
    # T1: .5
    # T2: .3
    # T3: 0
    # T4: .2
    # To account for how influential a paper is, we take all of the topic columns for a document
    # and multiplies their weights by the logged citations the paper has received.
    citation_weighted_topics = df[df.columns[-n_topics:]]
    citation_weighted_topics = citation_weighted_topics.apply(lambda x: x * np.log1p(df.cited_by_count), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## write data to R
    # import code to write r modules and create our variable we'll write to
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    r = {'weighted_sums' : weighted_sums,
         'topic_sums' : topic_sums,
         'topic_means' : topic_means }

    # R convention prefers dots to underscores in object names.
    for var_name, x in r.items():
        robjects.r.assign(var_name.replace("_", "."), x)

    if not os.path.exists(os.path.dirname(args.o)):
        os.makedirs(os.path.dirname(args.o))

    # Save all three objects into one RData file, then clear the embedded
    # R workspace so repeated calls don't accumulate state.
    robjects.r('save({},file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o
    ))
    robjects.r("rm(list=ls())")
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user