1
0
Files
social-media-chapter/code/data_collection/02_get_cited_by.py
Benjamin Mako Hill dd420c77de initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00

44 lines
1.7 KiB
Python

from request_functions import *
import argparse
import json
import subprocess
from os import remove
def _load_completed_eids(out_path):
    """Return the parent eids already finished in ``out_path``, pruning the tail.

    The parent eid of the last line in the file may have been interrupted
    part-way through, so every line carrying that ``parent_eid`` is removed
    from the file and the eid is NOT reported as completed (it will be
    fetched again from scratch). A final line that is not valid JSON (e.g. a
    write that was cut off) is dropped as well.

    Args:
        out_path: Path of the JSON-lines results file; each line is expected
            to be an object with a ``parent_eid`` key.

    Returns:
        A set of parent eids whose results are kept in the file. Empty if the
        file cannot be opened for reading.

    Raises:
        ValueError: If an unparsable line is followed by further content,
            i.e. the damage is not limited to the end of the file.
        KeyError: If a line is valid JSON but has no ``parent_eid``.
    """
    # Local import: the module itself only does ``from os import remove``.
    import os

    try:
        with open(out_path, 'r') as f:
            parents = []
            for line in f:
                try:
                    parents.append(json.loads(line)['parent_eid'])
                except ValueError:
                    # Only a damaged *tail* is tolerated; if anything
                    # non-blank follows, the file is corrupt and we refuse
                    # to touch it.
                    if any(rest.strip() for rest in f):
                        raise
                    break
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids
        return set()

    last_eid = parents[-1] if parents else None

    # Rewrite the file without the last eid's rows. Matching is done on the
    # parsed value, so an eid that is merely a prefix of another one (or that
    # contains regex metacharacters such as '.') cannot delete foreign rows.
    # zip() stops after the last parsed line, which also drops a broken tail.
    tmp_path = '{}.tmp'.format(out_path)
    with open(out_path, 'r') as src, open(tmp_path, 'w') as dst:
        for line, parent in zip(src, parents):
            if parent != last_eid:
                dst.write(line)
    # Swap the pruned copy in with a single rename so an interruption here
    # cannot leave a half-written results file behind.
    os.replace(tmp_path, out_path)

    completed = set(parents)
    # The last eid has one entry per output line; discard it entirely so it
    # is fetched again now that its rows are gone.
    completed.discard(last_eid)
    return completed


def main():
    """Append the citing articles of every eid in the input file to the output.

    Command-line arguments:
        -i  JSON-lines file; each line must contain ``eid`` and
            ``citedby-count``.
        -o  JSON-lines file that results are appended to. If it already
            exists, eids found in it are skipped, except the last one, which
            is pruned and fetched again because it may be incomplete.
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', required=True, help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', required=True, help='Where to append JSON results')
    args = parser.parse_args()

    # Make a dictionary of eid:citation count for each line in the file
    eids = {}
    with open(args.i, 'r') as f:
        for line in f:
            record = json.loads(line)
            eids[record['eid']] = record['citedby-count']

    # If the script gets interrupted, we need to start where we left off
    completed_eids = _load_completed_eids(args.o)

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            # The count is compared against the string '0' (presumably the
            # API returns counts as strings -- confirm against the input).
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)
# Run the collection only when this file is executed as a script.
if __name__ == "__main__":
    main()