initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions
--- a/code/data_collection/00_get_search_results.py
+++ b/code/data_collection/00_get_search_results.py
@@ -0,0 +1,24 @@
+import argparse
+from request_functions import *
+
+'''
+This script takes in a search query and an output file. It queries the Scopus API to find all papers that match the search query, and saves them to the output file.
+
+Unlike some of the other scripts in this directory, it does not try to resume from earlier output: if you restart the script, it starts over and overwrites whatever you had saved before.
+'''
+
+# Publication years to query, one search per year: 2004 through 2016
+# (the upper bound of range() is exclusive).
+years = range(2004, 2017)
+
+def main():
+    '''Run the search query once per year in `years` and save all results.
+
+    Command-line arguments:
+        -q  the search query (required)
+        -o  path of the output file (required)
+
+    The output file is opened in write mode, so anything already in it is
+    discarded. The actual fetching and writing is delegated to
+    get_search_results.
+    '''
+    parser = argparse.ArgumentParser(description='Output JSON of all articles matching search query')
+    parser.add_argument('-q', help='Search query', required=True)
+    # -o must be required: without it open(None, 'w') fails with a TypeError.
+    # The help text says "write" because the file is truncated, not appended to.
+    parser.add_argument('-o', help='Where to write JSON results (overwritten on every run)', required=True)
+    args = parser.parse_args()
+
+    with open(args.o, 'w') as out_file:
+        for year in years:
+            get_search_results(args.q, out_file, year=year)
+
+if __name__ == '__main__':
+    main()
--- a/code/data_collection/01_get_abstracts.py
+++ b/code/data_collection/01_get_abstracts.py
@@ -0,0 +1,56 @@
+from request_functions import *
+import argparse
+import json
+import subprocess
+
+
+def main():
+    '''Fetch the abstract of each requested eid and append it to the output file.
+
+    Command-line arguments:
+        -i          file with one JSON object per line, each having an 'eid' key
+        --eid / -e  a single eid (takes precedence over -i)
+        -o          output file, appended to (required)
+
+    Eids already present in the output file are skipped, so an interrupted
+    run can be restarted. Eids for which get_abstract returns nothing, and
+    lines of the output file that are not valid JSON, are appended to
+    raw_data/missing_eids.json, one per line.
+    '''
+    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
+    parser.add_argument('-i', help='JSON file which includes eids')
+    parser.add_argument('--eid', '-e', help='Single eid')
+    # Required: every code path below opens this file
+    parser.add_argument('-o', help='Where to append JSON results', required=True)
+    args = parser.parse_args()
+
+    if args.eid:
+        eids = [args.eid]
+    elif args.i:
+        with open(args.i, 'r') as f:
+            eids = [json.loads(line)['eid'] for line in f]
+    else:
+        # Exit here; carrying on would fail later with a NameError on `eids`
+        parser.error('Need to either pass in an eid or a json file with eids')
+
+    # If the script gets interrupted, we need to start where we left off
+    errors = []
+    # A set keeps the "already done?" check cheap for large inputs
+    completed_eids = set()
+    try:
+        with open(args.o, 'r') as f:
+            for line in f:
+                try:
+                    result = json.loads(line)
+                    completed_eids.add(result['abstracts-retrieval-response']['coredata']['eid'])
+                except ValueError:
+                    # Not valid JSON; remember the line so it gets reported
+                    errors.append(line)
+    except IOError:
+        # No readable output file yet, so nothing has been completed
+        completed_eids = set()
+
+    print('{} completed eids'.format(len(completed_eids)))
+    with open(args.o, 'a') as out_file:
+        for eid in eids:
+            if eid in completed_eids:
+                continue
+            result = get_abstract(eid)
+            if result:
+                out_file.write(result)
+                out_file.write('\n')
+            else:
+                errors.append(eid)
+
+    if errors:
+        with open('raw_data/missing_eids.json', 'a') as missing_file:
+            # Bad lines from the output file keep their newline, eids have
+            # none; normalise so every entry ends up on its own line.
+            for entry in errors:
+                missing_file.write(entry.rstrip('\n') + '\n')
+
+
+if __name__ == '__main__':
+    main()
--- a/code/data_collection/02_get_cited_by.py
+++ b/code/data_collection/02_get_cited_by.py
@@ -0,0 +1,43 @@
+from request_functions import *
+import argparse
+import json
+import subprocess
+from os import remove
+
+def _parent_eid(line):
+    '''Return the parent_eid stored in one output line, or None if the line is not valid JSON.'''
+    try:
+        return json.loads(line)['parent_eid']
+    except ValueError:
+        # e.g. a line cut short when the previous run was interrupted
+        return None
+
+def main():
+    '''Fetch the citing articles of every eid in the input file.
+
+    Command-line arguments:
+        -i  file with one JSON object per line, each having 'eid' and
+            'citedby-count' keys (required)
+        -o  output file, appended to (required)
+
+    If the output file already exists, the run resumes: every parent eid
+    found in it counts as completed except the last one, whose lines are
+    removed and which is fetched again because it may be incomplete. Eids
+    whose citation count is the string '0' are skipped.
+    '''
+    # Local import: this file only imports `remove` from os at module level
+    import os
+
+    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
+    parser.add_argument('-i', help='JSON file which includes eids and citedby-count', required=True)
+    parser.add_argument('-o', help='Where to append JSON results', required=True)
+    args = parser.parse_args()
+
+    with open(args.i, 'r') as f:
+        # Make a dictionary of eid:citation count for each line in the file
+        eids = {}
+        for line in f:
+            record = json.loads(line)
+            eids[record['eid']] = record['citedby-count']
+
+    # If the script gets interrupted, we need to start where we left off
+    completed_eids = set()
+    last_eid = None
+    try:
+        # Collect every parent eid already in the output file, remembering
+        # which one was written last
+        with open(args.o, 'r') as f:
+            for line in f:
+                parent = _parent_eid(line)
+                if parent is not None:
+                    completed_eids.add(parent)
+                    last_eid = parent
+    except IOError:
+        # If the file doesn't exist, then there aren't any completed eids
+        completed_eids = set()
+        last_eid = None
+
+    if last_eid is not None:
+        # The last parent may be incomplete: forget it entirely (not just one
+        # occurrence) so that it is fetched again below.
+        completed_eids.discard(last_eid)
+        # Rewrite the output without that parent's lines and without
+        # unparsable lines. Matching on the parsed value avoids deleting
+        # other eids that merely start with the same characters.
+        tmp_path = '{}.tmp'.format(args.o)
+        with open(args.o, 'r') as src, open(tmp_path, 'w') as dst:
+            for line in src:
+                if _parent_eid(line) in completed_eids:
+                    dst.write(line)
+        # Swap the cleaned file in only once it is completely written
+        os.replace(tmp_path, args.o)
+
+    with open(args.o, 'a') as out_file:
+        for eid, citation_count in eids.items():
+            if citation_count != '0' and eid not in completed_eids:
+                get_cited_by(eid, out_file)
+
+if __name__ == '__main__':
+    main()
--- a/code/data_collection/request_functions.py
+++ b/code/data_collection/request_functions.py
@@ -0,0 +1,166 @@
+import requests
+from datetime import datetime
+from scopus_api import key as API_KEY
+import json
+import os
+import logging
+import re
+
+# Log at DEBUG level so the debug messages emitted in this module are shown
+logging.basicConfig(level=logging.DEBUG)
+
+# Defaults used by the request helpers below: how long to wait for a
+# response, and how many attempts to make
+TIMEOUT_SECS = 10
+RETRY_COUNT = 5
+
+# One module-wide session, so every request carries the same headers
+s = requests.Session()
+s.headers.update({
+    'X-ELS-APIKey': API_KEY,
+    'X-ELS-ResourceVersion': 'XOCS',
+    'Accept': 'application/json',
+})
+
+def get_token(location_id = None):
+    '''Request an authentication token and store it on the shared session.
+
+    location_id is sent as the 'choice' parameter of the authenticate
+    call. On success the token is saved in the session's X-ELS-AuthToken
+    header, so later requests made through `s` include it.
+
+    Raises requests.exceptions.HTTPError if the server answers with an
+    error status, and requests.exceptions.Timeout if no answer arrives
+    within TIMEOUT_SECS seconds.
+    '''
+    print('Getting a token')
+    # HTTPS, like the search endpoint: the session headers carry the API key
+    api_resource = 'https://api.elsevier.com/authenticate'
+    # Parameters
+    payload = {'platform':'SCOPUS',
+            'choice': location_id}
+    # Bound the wait, as the other requests in this module do
+    r = s.get(api_resource, params = payload, timeout = TIMEOUT_SECS)
+    r.raise_for_status()
+    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
+
+def get_search_results(query, output_file, results_per_call = 200,
+        tot_results=None, year=None, sort='+title', citation_call=False):
+    '''Fetch the results of a search and write them to output_file.
+
+    Results are written as JSON dictionaries, one per line.
+
+    query            -- the search query
+    output_file      -- open, writable file object
+    results_per_call -- page size used for each API call
+    tot_results      -- number of results to fetch; when None, a first call
+                        is made to find out how many results exist
+    year             -- passed through to make_search_call
+    sort             -- sort order; a leading '-' means descending
+    citation_call    -- when true, the query must look like 'refeid(<eid>)'
+                        and every result is tagged with 'parent_eid': <eid>
+
+    With more than 5000 results the work is split in two: the first half is
+    read in '+title' order and the second half from the other end in
+    '-title' order. With more than 10,000 results only 5000 from each end
+    are retrieved. Always returns None.
+    '''
+    result_set = []
+    results_added = 0
+    def curr_call(start=0, count=results_per_call):
+        '''Shorthand for the current call: DRY'''
+        return make_search_call(query, start=start,
+            count=count, year=year, sort=sort)
+    if tot_results is None:
+        # Call the API initially to figure out how many results there are, and keep that first page
+        initial_results = curr_call(count=results_per_call)
+        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
+        result_set.append((initial_results, sort))
+        results_added += results_per_call
+    logging.debug("Total results: {}".format(tot_results))
+
+    if tot_results == 0:
+        return None
+    if tot_results > 5000:
+        if tot_results > 10000:
+            # We can't get any more granular than this, so return what we can.
+            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
+            first_half = last_half = 5000
+        else:
+            # Get half, and correct for odd # of results
+            first_half = tot_results//2 + tot_results % 2
+            last_half = tot_results//2
+        # Break the search into the first half and the bottom half of results.
+        # citation_call is forwarded so the halves tag their results with
+        # parent_eid as well; results_per_call so they use the same page size.
+        get_search_results(query, output_file,
+                results_per_call=results_per_call,
+                year=year,
+                tot_results=first_half,
+                citation_call=citation_call)
+        # Get the other half
+        get_search_results(query, output_file,
+                results_per_call=results_per_call,
+                year=year,
+                tot_results=last_half, sort='-title',
+                citation_call=citation_call)
+        # The two calls above have written everything. Writing result_set
+        # here would output the initial page a second time.
+        return None
+
+    # If there are 5000 or fewer to retrieve, then get them
+    logging.debug('Retrieving {} results'.format(tot_results))
+    # As long as there are more results to retrieve, keep fetching pages
+    while results_added < tot_results:
+        # If we are near the end, then only get as many results as are left.
+        to_retrieve = min(results_per_call, (tot_results - results_added))
+        curr_results = curr_call(start=results_added, count=to_retrieve)
+        result_set.append((curr_results, sort))
+        results_added += results_per_call
+    # This is hacky, but I'm doing it
+    # If this is a citation call, then construct metadata to be written with the result
+    if citation_call:
+        metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
+    else:
+        metadata = {}
+    write_results(result_set, output_file, metadata)
+
+def write_results(result_set, output_file, metadata=None):
+    '''Write every entry of every page in result_set as one JSON line.
+
+    result_set  -- sequence of (search_json, sort) pairs, where search_json
+                   holds the entries under ['search-results']['entry']
+    output_file -- open, writable text file object
+    metadata    -- optional dict whose items are copied into each entry
+                   before it is written
+
+    Pages whose sort string starts with '-' are written in reverse order.
+    The entry dictionaries are updated in place with the metadata.
+    Raises KeyError if a page has no ['search-results']['entry'].
+    '''
+    if metadata is None:
+        metadata = {}
+    for page in result_set:
+        search_json = page[0]
+        to_reverse = page[1].startswith('-')
+        # Copy the list so reversing does not reorder the caller's data
+        entries = list(search_json['search-results']['entry'])
+        if to_reverse:
+            entries.reverse()
+        for entry in entries:
+            entry.update(metadata)
+            json.dump(entry, output_file)
+            output_file.write('\n')
+
+
+def make_search_call(query, start=0, count=200,
+        sort='+title', year=None,
+        retry_limit = RETRY_COUNT,
+        timeout_secs = TIMEOUT_SECS):
+    '''Make one call to the Scopus search endpoint and return the parsed JSON.
+
+    Up to retry_limit attempts are made. An attempt that times out is
+    retried; an attempt answered with 401 triggers get_token() and is
+    retried; a 400 raises requests.exceptions.HTTPError immediately. If no
+    attempt succeeds, requests.exceptions.Timeout is raised. Any other
+    error status is raised by raise_for_status().
+    '''
+    api_resource = "https://api.elsevier.com/content/search/scopus"
+    # Parameters
+    payload = {
+        'query': query,
+        'count': count,
+        'start': start,
+        'sort': sort,
+        'date': year,
+    }
+    response = None
+    for _ in range(retry_limit):
+        try:
+            attempt = s.get(api_resource,
+                    params = payload,
+                    timeout = timeout_secs)
+            logging.debug(attempt.url)
+            if attempt.status_code == 401:
+                # Not authenticated: get a token, then use up another attempt
+                get_token()
+                continue
+            if attempt.status_code == 400:
+                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
+        except requests.exceptions.Timeout:
+            continue
+        response = attempt
+        break
+    # Compare with None explicitly: a Response with an error status is falsy
+    if response is None:
+        raise requests.exceptions.Timeout('Timeout Error')
+
+    response.raise_for_status()
+    return response.json()
+
+
+def get_cited_by(eid, output_file):
+    '''Search for refeid(<eid>) and write the results to output_file.
+
+    The search runs as a citation call, so each result is tagged with the
+    parent eid. Returns whatever get_search_results returns.
+    '''
+    citing_query = 'refeid({})'.format(eid)
+    return get_search_results(citing_query, output_file,
+            results_per_call=200, citation_call=True)
+
+
+def get_abstract(eid, retry_limit = RETRY_COUNT,
+        timeout_secs = TIMEOUT_SECS):
+    '''Fetch the abstract record for one eid.
+
+    Returns the response body decoded as UTF-8 text, or None if the server
+    answers 404 for this eid.
+
+    Up to retry_limit attempts are made. An attempt that times out is
+    retried; an attempt answered with 401 triggers get_token() and is
+    retried; a 400 raises requests.exceptions.HTTPError immediately. If no
+    attempt succeeds, requests.exceptions.Timeout is raised. Any other
+    error status is raised by raise_for_status().
+    '''
+    # HTTPS, like the search endpoint: the session headers carry the API key
+    api_resource = "https://api.elsevier.com/content/abstract/eid/{}".format(eid)
+    for _ in range(retry_limit):
+        try:
+            r = s.get(api_resource,
+                    timeout = timeout_secs)
+            if r.status_code == 401:
+                # Not authenticated: get a token, then use up another attempt
+                get_token()
+                continue
+            if r.status_code == 400:
+                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
+            break
+        except requests.exceptions.Timeout:
+            pass
+    else:
+        # The loop ended without a usable response
+        raise requests.exceptions.Timeout('Timeout Error')
+    if r.status_code == 404:
+        # No record for this eid; the caller decides what to do about it
+        return None
+    r.raise_for_status()
+    return r.content.decode('utf-8')
--- a/code/data_collection/scopus_api.py
+++ b/code/data_collection/scopus_api.py
@@ -0,0 +1 @@
+# Scopus API key. The value below is a placeholder made of X characters
+# (presumably to be replaced with a real key before running the scripts).
+key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'