We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
44 lines
1.7 KiB
Python
from request_functions import *
|
|
import argparse
|
|
import json
|
|
import subprocess
|
|
from os import remove
|
|
|
|
def main():
    """Fetch all articles citing each eid in an input JSON-lines file.

    Reads ``-i`` (one JSON object per line with ``eid`` and
    ``citedby-count`` keys), resumes a previously interrupted run by
    inspecting the ``-o`` output file, then appends one JSON record per
    citing article via ``get_cited_by``.
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    # Map eid -> citation count, one JSON object per input line.
    with open(args.i, 'r') as f:
        eids = {}
        for line in f:
            record = json.loads(line)
            eids[record['eid']] = record['citedby-count']

    # If the script gets interrupted, we need to start where we left off.
    try:
        # Open the output file and grab all eids which are already completed.
        with open(args.o, 'r') as f:
            completed = [json.loads(l)['parent_eid'] for l in f]

        # Re-fetch the last eid seen, since its results may be incomplete.
        if completed:
            last_eid = completed.pop()
            # Strip every output line belonging to that eid.
            # NOTE(review): last_eid is interpolated into a sed address, so
            # this is only safe while eids contain no regex metacharacters
            # or '/' — confirm against the eid format upstream.
            subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
            # Hopefully everything has worked out, because here we blow away the backup.
            remove('{}.bak'.format(args.o))
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids.
        completed = []

    # Set gives O(1) membership tests in the loop below (was an O(n) list scan per eid).
    completed_eids = set(completed)

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            # citedby-count is a string in the input; '0' means nothing cites it.
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|