initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
56
code/data_collection/01_get_abstracts.py
Normal file
56
code/data_collection/01_get_abstracts.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
|
||||
def main():
    """Fetch abstracts for a set of eids and append them to a JSON-lines file.

    Command-line arguments:
        -i          JSON-lines file; each line must be an object with an 'eid' key.
        --eid / -e  A single eid (takes precedence over -i).
        -o          Output file that results are appended to (required).

    The run is resumable: eids already present in the output file are skipped.
    Lines of the output file that cannot be interpreted, and eids for which
    ``get_abstract`` returns a falsy result, are appended (one per line) to
    'raw_data/missing_eids.json'.
    """
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    # Without an output path there is nothing to resume from or append to.
    parser.add_argument('-o', required=True, help='Where to append JSON results')
    args = parser.parse_args()

    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f]
    else:
        # Exit with a usage error instead of continuing with `eids` undefined.
        parser.error('Need to either pass in an eid or a json file with eids')

    # If the script gets interrupted, we need to start where we left off
    errors = []
    completed_eids = set()  # set: O(1) membership test in the fetch loop
    try:
        with open(args.o, 'r') as f:
            for line in f:
                if not line.strip():
                    # Blank lines carry no information; don't report them.
                    continue
                try:
                    result = json.loads(line)
                    completed_eids.add(result['abstracts-retrieval-response']['coredata']['eid'])
                except (ValueError, KeyError, TypeError):
                    # Not JSON, or JSON without the expected structure.
                    errors.append(line)
    except IOError:
        # No readable output file yet: nothing has been completed so far.
        pass

    print('{} completed eids'.format(len(completed_eids)))
    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid in completed_eids:
                continue
            # get_abstract comes from request_functions; presumably it returns
            # the response as a string, or a falsy value on failure -- TODO confirm.
            result = get_abstract(eid)
            if result:
                out_file.write(result)
                out_file.write('\n')
            else:
                errors.append(eid)

    if errors:
        with open('raw_data/missing_eids.json', 'a') as missing_file:
            # Add the bad lines from the output file and the eids that failed,
            # one entry per line.
            for entry in errors:
                missing_file.write(str(entry).rstrip('\n') + '\n')
|
||||
|
||||
|
||||
# Run the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user