1
0
Files
social-media-chapter/code/data_processing/03_make_paper_aff_table.py
Benjamin Mako Hill dd420c77de initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00

63 lines
2.0 KiB
Python

import json
import argparse
import csv
def main():
    """Read an abstracts file and write a paper-to-affiliation TSV.

    Command-line arguments:
        -i  Path of the abstract file. Each line is passed to
            ``get_entries`` as one JSON document.
        -o  Path of the tab-separated output file.

    The output starts with the header row ``paper_eid``, ``affiliation_id``,
    ``organization``, ``country`` and is followed by every row that
    ``get_entries`` returns, in input order.
    """
    parser = argparse.ArgumentParser(
        description='Generate paper to affiliation mapping file from abstracts file')
    # Both paths are mandatory: without required=True a missing flag only
    # surfaces later as an opaque TypeError from open(None).
    parser.add_argument('-i', required=True, help='Abstract file')
    parser.add_argument('-o', required=True, help='TSV output file')
    args = parser.parse_args()

    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' would emit '\r\r\n' after every row.
    # The input is opened first, so a bad input path does not create or
    # truncate the output file.
    with open(args.i, 'r') as infile, open(args.o, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['paper_eid', 'affiliation_id',
                         'organization', 'country'])
        for line in infile:
            writer.writerows(get_entries(line))
def get_entries(l):
    """Turn one JSON-encoded abstract record into affiliation rows.

    ``l`` is a JSON string whose top-level ``abstracts-retrieval-response``
    object supplies the paper id (``coredata.eid``) and the bibliographic
    head (``item.bibrecord.head``) that is passed to ``get_aff_info``.

    Returns a list of ``[eid, affiliation_id, organization, country]``
    lists, one per entry in the country list, or ``[]`` when
    ``get_aff_info`` yields no countries (``None`` or an empty list).
    Raises ``KeyError`` if any of the expected keys is missing.
    """
    response = json.loads(l)['abstracts-retrieval-response']
    head = response['item']['bibrecord']['head']
    eid = response['coredata']['eid']

    countries = get_aff_info(head, 'country')
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')

    if not countries:
        return []

    # The country list drives the row count; the other two lists are read
    # at the same position.
    return [
        [eid, affiliation_ids[pos], org_names[pos], country]
        for pos, country in enumerate(countries)
    ]
def get_aff_info(head, affiliation_key):
    """Collect one affiliation field from every author group in ``head``.

    ``head['author-group']`` is normalised to a list with ``make_list``.
    For each group that is not ``None``, the value stored under
    ``group['affiliation'][affiliation_key]`` is appended to the result.
    A list value is reduced to the ``'$'`` entry of its last element
    (the base organization).

    Returns ``None`` when ``head`` has no ``'author-group'`` key; otherwise
    a list with one item per non-``None`` group, using ``''`` wherever a
    lookup raised ``KeyError``.

    NOTE(review): a non-list value is appended unchanged. If a single entry
    can arrive as a ``{'$': ...}`` mapping rather than a plain string, it
    would be stored unflattened -- confirm against the data.
    """
    try:
        author_groups = make_list(head['author-group'])
    except KeyError:
        return None

    values = []
    for group in author_groups:
        if group is not None:
            try:
                value = group['affiliation'][affiliation_key]
                if isinstance(value, list):
                    # Every element is read, so a missing '$' anywhere in
                    # the list falls through to the empty-string case.
                    value = [part['$'] for part in value][-1]
            except KeyError:
                # No affiliation info for these authors: keep the slot
                # with an empty string so positions stay aligned.
                value = ''
            values.append(value)
    return values
def make_list(list_or_dict):
    """Return ``list_or_dict`` itself if it is a list, else wrap it in one.

    Any non-list value (a dict, ``None``, a tuple, ...) comes back as a
    single-element list; a list is returned as the same object, not a copy.
    """
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file I/O.
if __name__ == '__main__':
    main()