# social-media-chapter/code/data_collection/01_get_abstracts.py

from request_functions import *
import argparse
import json
import subprocess


def main():
    """Fetch abstracts for a set of eids and append them to a JSON-lines file.

    Command-line arguments:
        -i         File with one JSON object per line, each having an 'eid' key.
        --eid, -e  A single eid; takes precedence over -i when both are given.
        -o         Output file; one result is appended per line.

    The output file is read first so that eids already present in it are
    skipped, which lets an interrupted run resume where it stopped. Output
    lines that cannot be interpreted, and eids for which ``get_abstract``
    returns a falsy result, are appended to ``raw_data/missing_eids.json``,
    one entry per line.

    Exits through ``parser.error`` (SystemExit) when no eid source or no
    output file is supplied.
    """
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f]
    else:
        # Stop here: continuing without eids would only fail later with a
        # NameError.
        parser.error('Need to either pass in an eid or a json file with eids')

    if not args.o:
        # open(None) raises TypeError, which the IOError handler below would
        # not catch, so reject the missing argument up front.
        parser.error('Need to pass in an output file with -o')

    # If the script gets interrupted, we need to start where we left off.
    # A set keeps the per-eid membership test below O(1).
    completed_eids = set()
    errors = []
    try:
        with open(args.o, 'r') as f:
            for line in f:
                try:
                    result = json.loads(line)
                    completed_eids.add(result['abstracts-retrieval-response']['coredata']['eid'])
                except (ValueError, KeyError, TypeError):
                    # Not JSON, or JSON without the expected structure:
                    # record the line instead of aborting the resume.
                    errors.append(line)
    except IOError:
        # The output file could not be opened or read (e.g. first run);
        # keep whatever was collected so far.
        pass

    print('{} completed eids'.format(len(completed_eids)))
    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid in completed_eids:
                continue
            # get_abstract is not defined in this file; presumably it comes
            # from the wildcard import of request_functions.
            result = get_abstract(eid)
            if result:
                out_file.write(result)
                out_file.write('\n')
                # Avoid fetching the same eid twice if the input repeats it.
                completed_eids.add(eid)
            else:
                errors.append(eid)

    if errors:
        with open('raw_data/missing_eids.json', 'a') as missing_file:
            # Add the bad lines from the output file and the failed eids.
            # Bad lines still carry their newline while eids do not, so
            # normalise to exactly one entry per line.
            for entry in errors:
                missing_file.write(entry.rstrip('\n') + '\n')

# Run the collection only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()