
initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
import argparse

from request_functions import *

'''
This script takes in a search query and an output file. It queries the Scopus
API for all papers that match the search query, and saves them to the output
file. Unlike some of the other scripts in this directory, it does not try to
determine the state - if you restart the script, it will start over and blow
away whatever you had saved before.
'''

years = range(2004, 2017)


def main():
    parser = argparse.ArgumentParser(description='Output JSON of all articles matching search query')
    parser.add_argument('-q', help='Search query', required=True)
    parser.add_argument('-o', help='Where to write JSON results', required=True)
    args = parser.parse_args()
    with open(args.o, 'w') as out_file:
        for year in years:
            get_search_results(args.q, out_file, year=year)


if __name__ == '__main__':
    main()
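
A hypothetical invocation, assuming this file is saved as search.py (filenames aren't shown in this view) and the query uses Scopus search syntax:

python search.py -q 'TITLE-ABS-KEY(metabolomics)' -o raw_data/papers.json

This writes one JSON object per line to the output file, year by year from 2004 through 2016.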

View File

@@ -0,0 +1,56 @@
import argparse
import json

from request_functions import *


def main():
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    parser.add_argument('-o', help='Where to append JSON results', required=True)
    args = parser.parse_args()
    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f]
    else:
        print('Need to either pass in an eid or a json file with eids')
        return
    # If the script gets interrupted, we need to start where we left off
    errors = []
    completed_eids = set()
    try:
        with open(args.o, 'r') as f:
            for line in f:
                try:
                    result = json.loads(line)
                    completed_eids.add(result['abstracts-retrieval-response']['coredata']['eid'])
                except ValueError:
                    # Unparseable lines get re-recorded in the error file
                    errors.append(line)
    except IOError:
        # If the output file doesn't exist yet, nothing is completed
        pass
    print('{} completed eids'.format(len(completed_eids)))
    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid not in completed_eids:
                result = get_abstract(eid)
                if result:
                    out_file.write(result)
                    out_file.write('\n')
                else:
                    errors.append(eid)
    if len(errors) > 0:
        # Record the bad lines from the output file, along with any eids
        # whose abstracts could not be retrieved
        with open('raw_data/missing_eids.json', 'a') as err_file:
            for e in errors:
                err_file.write(e if e.endswith('\n') else e + '\n')


if __name__ == '__main__':
    main()
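
The -i input is the line-delimited JSON produced by the search script above: one object per line, each with at least an eid field. A hypothetical line (this identifier is made up, though real Scopus eids share the 2-s2.0- prefix):

{"eid": "2-s2.0-0000000000"}

Anything that can't be retrieved or parsed ends up appended to raw_data/missing_eids.json.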

View File

@@ -0,0 +1,43 @@
import argparse
import json
import subprocess
from os import remove

from request_functions import *


def main():
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count', required=True)
    parser.add_argument('-o', help='Where to append JSON results', required=True)
    args = parser.parse_args()
    with open(args.i, 'r') as f:
        # Make a dictionary of eid: citation count for each line in the file
        eids = {}
        for line in f:
            record = json.loads(line)
            eids[record['eid']] = record['citedby-count']
    # If the script gets interrupted, we need to start where we left off
    try:
        # Open the output file, and grab all of the eids which are already completed
        with open(args.o, 'r') as f:
            completed_eids = [json.loads(line)['parent_eid'] for line in f]
        # Remove the results which came from the last eid, since we may have
        # stopped partway through its citing articles
        if len(completed_eids) > 0:
            last_eid = completed_eids.pop()
            # Remove all of the lines which came from the last eid
            subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
            # Hopefully everything has worked out, because here we blow away the backup
            remove('{}.bak'.format(args.o))
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids
        completed_eids = []
    completed_eids = set(completed_eids)
    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)


if __name__ == '__main__':
    main()
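
The sed call above edits the output file in place, keeping a .bak copy until the deletion succeeds. A pure-Python sketch of the same operation, assuming the file fits in memory (the function name is just for illustration):

def drop_lines_for_eid(path, last_eid):
    # Drop every line whose parent_eid matches the last, possibly
    # incomplete, eid; the next run re-fetches its citing articles.
    with open(path, 'r') as f:
        kept = [line for line in f
                if '"parent_eid": "{}"'.format(last_eid) not in line]
    with open(path, 'w') as f:
        f.writelines(kept)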

request_functions.py View File

@@ -0,0 +1,166 @@
import json
import logging
import re

import requests

from scopus_api import key as API_KEY

logging.basicConfig(level=logging.DEBUG)

RETRY_COUNT = 5
TIMEOUT_SECS = 10

# Initialize a global session object
s = requests.Session()
s.headers.update({'X-ELS-APIKey': API_KEY,
                  'X-ELS-ResourceVersion': 'XOCS',
                  'Accept': 'application/json'})


def get_token(location_id=None):
    '''Given a location_id, gets an authentication token'''
    print('Getting a token')
    api_resource = 'http://api.elsevier.com/authenticate'
    payload = {'platform': 'SCOPUS',
               'choice': location_id}
    r = s.get(api_resource, params=payload)
    r.raise_for_status()
    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']


def get_search_results(query, output_file, results_per_call=200,
                       tot_results=None, year=None, sort='+title',
                       citation_call=False):
    '''Handles getting search results. Takes a query and an output
    file. Writes as many of the search results as possible to the
    output file as JSON dictionaries, one per line.'''
    result_set = []
    results_added = 0

    def curr_call(start=0, count=results_per_call):
        '''Shorthand for the current call: DRY'''
        return make_search_call(query, start=start,
                                count=count, year=year, sort=sort)

    if tot_results is None:
        # Call the API initially to figure out how many results there are,
        # and keep the results from that first call
        initial_results = curr_call(count=results_per_call)
        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
        result_set.append((initial_results, sort))
        results_added += results_per_call
        logging.debug("Total results: {}".format(tot_results))

    if tot_results == 0:
        return None
    if tot_results > 5000:
        # If this is just one year, we can't get any more granular, and
        # we need to return what we can.
        if tot_results > 10000:
            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
            first_half = last_half = 5000
        else:
            # Get half, and correct for an odd number of results
            first_half = tot_results // 2 + tot_results % 2
            last_half = tot_results // 2
        # Break the search into the first half and the last half of results.
        get_search_results(query, output_file,
                           year=year,
                           tot_results=first_half)
        # Get the other half, sorted in the reverse direction
        get_search_results(query, output_file,
                           year=year,
                           tot_results=last_half, sort='-title')
    # If there are 5000 or fewer to retrieve, then get them
    else:
        logging.debug('Retrieving {} results'.format(tot_results))
        # As long as there are more results to retrieve, do so, and
        # collect them for writing to the file
        while results_added < tot_results:
            # If we are near the end, then only get as many results as are left
            to_retrieve = min(results_per_call, (tot_results - results_added))
            curr_results = curr_call(start=results_added, count=to_retrieve)
            result_set.append((curr_results, sort))
            results_added += to_retrieve
    # This is hacky, but I'm doing it
    # If this is a citation call, then construct metadata to be written with the result
    if citation_call:
        metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
    else:
        metadata = {}
    write_results(result_set, output_file, metadata)


def write_results(result_set, output_file, metadata={}):
    for search_json, sort_order in result_set:
        # Results fetched with a descending sort get reversed, so the
        # file stays in ascending title order
        to_reverse = sort_order.startswith('-')
        results = search_json['search-results']['entry']
        if to_reverse:
            results = results[::-1]
        for result in results:
            for k, v in metadata.items():
                result[k] = v
            json.dump(result, output_file)
            output_file.write('\n')


def make_search_call(query, start=0, count=200,
                     sort='+title', year=None,
                     retry_limit=RETRY_COUNT,
                     timeout_secs=TIMEOUT_SECS):
    api_resource = "https://api.elsevier.com/content/search/scopus"
    payload = {'query': query,
               'count': count,
               'start': start,
               'sort': sort,
               'date': year}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params=payload,
                      timeout=timeout_secs)
            logging.debug(r.url)
            if r.status_code == 401:
                # Token expired; get a new one and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            break
        except requests.exceptions.Timeout:
            pass
    else:
        raise requests.exceptions.Timeout('Timeout Error')
    r.raise_for_status()
    return r.json()


def get_cited_by(eid, output_file):
    return get_search_results('refeid({})'.format(eid), output_file,
                              results_per_call=200, citation_call=True)


def get_abstract(eid, retry_limit=RETRY_COUNT,
                 timeout_secs=TIMEOUT_SECS):
    api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource, timeout=timeout_secs)
            if r.status_code == 401:
                # Token expired; get a new one and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            break
        except requests.exceptions.Timeout:
            pass
    else:
        raise requests.exceptions.Timeout('Timeout Error')
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.content.decode('utf-8')
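
The halving in get_search_results is the subtle part: the code assumes Scopus will only serve 5,000 results per sort order, so totals between 5,001 and 10,000 are fetched in two passes, ascending then descending by title, and anything past 10,000 is truncated. A minimal sketch of the split arithmetic (split_counts is illustrative, not part of the module):

def split_counts(tot_results):
    # Both passes max out at 5,000; past 10,000 the remainder is unreachable.
    if tot_results > 10000:
        return 5000, 5000
    # Split in two, giving the ascending pass the extra result when odd.
    return tot_results // 2 + tot_results % 2, tot_results // 2

assert split_counts(7001) == (3501, 3500)
assert split_counts(12000) == (5000, 5000)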

scopus_api.py View File

@@ -0,0 +1 @@
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
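
(The key above is a redacted placeholder - as the commit message explains, real API keys were stripped before this public import. request_functions.py reads a working Scopus API key from this file.)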