social-media-chapter/code/data_collection/02_get_cited_by.py

from request_functions import *
import argparse
import json
import subprocess
from os import remove

def _load_completed_eids(out_path):
    """Return the parent eids already finished in ``out_path``, pruning the last one.

    Every line of ``out_path`` is expected to be a JSON object with a
    ``parent_eid`` key.  The parent eid found on the final line may have been
    interrupted part-way through, so all of its lines are removed from the
    file and it is left out of the returned set; the caller will fetch it
    again from scratch.

    Args:
        out_path: Path of the line-delimited JSON output file.

    Returns:
        A set of parent eids whose results are kept in the file.  Empty when
        the file cannot be opened (e.g. first run) or contains no lines.

    Raises:
        ValueError: If a line of the file is not valid JSON.
        KeyError: If a line has no ``parent_eid`` key.
    """
    # Local import: the file header only brings in ``os.remove``.
    from os import replace

    # Only a failure to open the file means "no previous run".  Errors while
    # parsing or rewriting must not be mistaken for that, otherwise every eid
    # would be fetched again and appended as duplicates.
    try:
        f = open(out_path, 'r')
    except IOError:
        return set()

    completed = set()
    last_eid = None
    with f:
        for line in f:
            last_eid = json.loads(line)['parent_eid']
            completed.add(last_eid)

    if last_eid is None:
        # Empty file: nothing completed, nothing to prune.
        return completed

    # The last eid has one entry per output line, so it must be dropped from
    # the set as a whole, not just once, or it would never be re-fetched.
    completed.discard(last_eid)

    # Rewrite without the last eid's lines.  The parsed value is compared
    # exactly, so eids that merely share a prefix with it are kept.  Writing
    # to a temporary file and renaming keeps the original intact until the
    # filtered copy is complete.
    tmp_path = '{}.tmp'.format(out_path)
    with open(out_path, 'r') as src, open(tmp_path, 'w') as dst:
        for line in src:
            if json.loads(line)['parent_eid'] != last_eid:
                dst.write(line)
    replace(tmp_path, out_path)

    return completed


def main():
    """Fetch the citing articles for every eid in the input file.

    Command-line arguments:
        -i  Line-delimited JSON file; each line must have ``eid`` and
            ``citedby-count`` keys.
        -o  File that results are appended to.

    The run is resumable: eids already present in the output file are
    skipped, except the last one, which is pruned and fetched again because
    it may have been interrupted.  Eids whose ``citedby-count`` is the string
    ``'0'`` are skipped.
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    # Make a dictionary of eid:citation count for each line in the file
    # (a repeated eid keeps the count from its last occurrence).
    eids = {}
    with open(args.i, 'r') as f:
        for line in f:
            record = json.loads(line)
            eids[record['eid']] = record['citedby-count']

    # If the script gets interrupted, we need to start where we left off
    completed_eids = _load_completed_eids(args.o)

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            if citation_count != '0' and eid not in completed_eids:
                # get_cited_by comes from request_functions; presumably it
                # writes one JSON line per citing article to out_file.
                get_cited_by(eid, out_file)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()