initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
43
code/data_collection/02_get_cited_by.py
Normal file
43
code/data_collection/02_get_cited_by.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from os import remove
|
||||
|
||||
def _drop_results_for(out_path, eid):
    """Rewrite ``out_path`` without the lines whose 'parent_eid' equals ``eid``.

    Each line of the file is parsed as JSON and compared exactly, so an eid
    that is merely a prefix of another eid does not take the other one's
    lines with it. The filtered copy is written to ``<out_path>.tmp`` and
    then moved over the original, so the original is only replaced once the
    filtered copy is complete.
    """
    import os  # function-scope: the module only imports `remove` from os

    tmp_path = '{}.tmp'.format(out_path)
    with open(out_path, 'r') as src, open(tmp_path, 'w') as dst:
        for line in src:
            if json.loads(line)['parent_eid'] != eid:
                dst.write(line)
    # os.replace overwrites the destination on every platform; fall back to
    # os.rename on interpreters that predate it.
    getattr(os, 'replace', os.rename)(tmp_path, out_path)


def _load_completed_eids(out_path):
    """Return the set of parent eids whose results are fully written.

    The parent eid of the final line in ``out_path`` is treated as
    incomplete (the previous run may have been interrupted part-way through
    it): all of its lines are removed from the file and it is left out of
    the returned set so that it gets fetched again.

    Returns an empty set when the file does not exist or has no lines.
    """
    try:
        with open(out_path, 'r') as f:
            parent_eids = [json.loads(line)['parent_eid'] for line in f]
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids
        return set()

    if not parent_eids:
        return set()

    last_eid = parent_eids[-1]
    _drop_results_for(out_path, last_eid)

    # A set gives O(1) membership tests; discarding (rather than popping one
    # list entry) removes the last eid no matter how many lines it had.
    completed = set(parent_eids)
    completed.discard(last_eid)
    return completed


def main():
    """Fetch the citing articles for every eid listed in the input file.

    Command-line arguments:
        -i  JSON-lines file; each line must contain 'eid' and
            'citedby-count'.
        -o  File that results are appended to. Existing lines are read as
            JSON with a 'parent_eid' key so an interrupted run can resume.

    Eids whose 'citedby-count' is the string '0', or which were already
    completed in a previous run, are skipped. Everything else is handed to
    get_cited_by(eid, out_file) (defined outside this block; presumably it
    writes one JSON line per citing article -- confirm in request_functions).
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', required=True, help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', required=True, help='Where to append JSON results')
    args = parser.parse_args()

    with open(args.i, 'r') as f:
        # Make a dictionary of eid:citation count for each line in the file
        eids = {}
        for line in f:
            record = json.loads(line)
            eids[record['eid']] = record['citedby-count']

    # If the script gets interrupted, we need to start where we left off
    completed_eids = _load_completed_eids(args.o)

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)
|
||||
|
||||
# Script entry point: run the collection only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user