1
0
Files
social-media-chapter/code/data_collection/01_get_abstracts.py
Benjamin Mako Hill dd420c77de initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00

57 lines
1.8 KiB
Python

from request_functions import *
import argparse
import json
import subprocess
def _read_completed(path):
    """Return ``(completed_eids, bad_lines)`` recovered from a previous run.

    Each line of the file at *path* is expected to be one JSON document whose
    eid lives at ``['abstracts-retrieval-response']['coredata']['eid']``.

    Returns:
        A set of eids already present in the file, and a list of the raw
        lines that could not be parsed or did not carry an eid.  If the file
        cannot be opened or read, the set is empty.
    """
    completed = set()
    bad_lines = []
    try:
        with open(path, 'r') as f:
            for line in f:
                if not line.strip():
                    # A blank line is not a failed record; ignore it.
                    continue
                try:
                    record = json.loads(line)
                    completed.add(record['abstracts-retrieval-response']['coredata']['eid'])
                except (ValueError, KeyError, TypeError):
                    # Invalid JSON, or valid JSON without the expected
                    # structure: keep the raw line so it can be reported.
                    bad_lines.append(line)
    except IOError:
        # Typically the output file does not exist yet: nothing is completed.
        completed = set()
    return completed, bad_lines


def _append_entries(path, entries):
    """Append each entry to the file at *path*, exactly one entry per line."""
    with open(path, 'a') as f:
        for entry in entries:
            # Bad lines read from a file already end in a newline; bare eids
            # do not.  Normalize so that entries never run together.
            f.write(entry if entry.endswith('\n') else entry + '\n')


def main():
    """Fetch abstracts for the requested eids and append them to the output file.

    Command-line arguments:
        -i          file with one JSON object per line, each having an 'eid' key
        --eid / -e  a single eid (takes precedence over -i)
        -o          file that results are appended to, one per line

    The run is resumable: eids already present in the -o file are skipped.
    Eids for which ``get_abstract`` returns a falsy value, and unparseable
    lines found in the -o file, are appended to 'raw_data/missing_eids.json'.

    Exits through ``parser.error`` (status 2) when no eid source or no output
    file is given.
    """
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f if line.strip()]
    else:
        # Previously this only printed and then crashed on the undefined
        # `eids`; stop cleanly instead.
        parser.error('Need to either pass in an eid or a json file with eids')
    if not args.o:
        parser.error('Need to pass in an output file with -o')

    # If the script gets interrupted, we need to start where we left off
    completed_eids, errors = _read_completed(args.o)
    print('{} completed eids'.format(len(completed_eids)))

    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid in completed_eids:
                continue
            # get_abstract comes from request_functions; presumably it returns
            # the response as text, or a falsy value on failure -- TODO confirm.
            result = get_abstract(eid)
            if result:
                out_file.write(result)
                out_file.write('\n')
                # Avoid fetching the same eid again if the input repeats it.
                completed_eids.add(eid)
            else:
                errors.append(eid)

    if errors:
        # Record both the bad lines from the output file and the eids that
        # could not be fetched.
        _append_entries('raw_data/missing_eids.json', errors)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()