We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
63 lines
2.0 KiB
Python
63 lines
2.0 KiB
Python
import json
|
|
import argparse
|
|
import csv
|
|
|
|
def main():
    """Convert a file of abstract-retrieval JSON records into a TSV mapping.

    Reads the file given by ``-i`` line by line, treating every non-blank
    line as one JSON document, and writes the rows returned by
    ``get_entries`` for each line to the tab-separated file given by ``-o``.
    A header row (paper_eid, affiliation_id, organization, country) is
    written first.
    """
    parser = argparse.ArgumentParser(
        description='Generate paper to affiliation mapping file from abstracts file')
    # Both paths are mandatory; without them open() would be handed None and
    # fail with a TypeError instead of a usage message.
    parser.add_argument('-i', required=True, help='Abstract file')
    parser.add_argument('-o', required=True, help='TSV output file')
    args = parser.parse_args()

    # newline='' hands line-ending control to the csv module (otherwise extra
    # blank rows appear on platforms that translate '\n'). UTF-8 is explicit
    # so the result does not depend on the machine's locale.
    with open(args.i, 'r', encoding='utf-8') as abstracts, \
            open(args.o, 'w', encoding='utf-8', newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(['paper_eid', 'affiliation_id',
                         'organization', 'country'])
        for line in abstracts:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document; skip it rather than crash in json.loads.
            if not line.strip():
                continue
            writer.writerows(get_entries(line))
|
|
|
|
|
|
def get_entries(l):
    """Build the output rows for one abstract-retrieval JSON record.

    Args:
        l: A string holding one JSON document. Its top-level
            'abstracts-retrieval-response' object must contain
            'item' -> 'bibrecord' -> 'head' and 'coredata' -> 'eid'.

    Returns:
        A list of ``[eid, affiliation_id, organization, country]`` lists, one
        per entry reported by ``get_aff_info``. An empty list is returned
        when ``get_aff_info`` yields nothing for the record.

    Raises:
        KeyError: If one of the keys named above is missing.
        ValueError: If ``l`` is not valid JSON (raised by ``json.loads``).
    """
    response = json.loads(l)['abstracts-retrieval-response']
    head = response['item']['bibrecord']['head']
    eid = response['coredata']['eid']

    countries = get_aff_info(head, 'country')
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')

    # get_aff_info gives None when there is no 'author-group' and an empty
    # list when every group was None; both mean "no rows".
    if not countries:
        return []

    # The three lists come from the same author groups, so they line up
    # position by position.
    return [[eid, afid, org, country]
            for afid, org, country in zip(affiliation_ids, org_names, countries)]
|
|
|
|
def get_aff_info(head, affiliation_key):
    """Collect one affiliation field from every author group under *head*.

    Args:
        head: Mapping that may contain an 'author-group' entry holding either
            a single author group or a list of them.
        affiliation_key: Key to read from each group's 'affiliation' mapping
            (callers in this file pass 'country', '@afid' and 'organization').

    Returns:
        None when *head* has no 'author-group' key. Otherwise a list with one
        item per author group that is not None, in order:

        * the value stored under *affiliation_key*;
        * for a list value, the '$' entry of its last element (the base
          organization), or '' if the list is empty;
        * '' when the group has no 'affiliation', the key is absent, or the
          last list element has no '$'.
    """
    try:
        author_groups = make_list(head['author-group'])
    except KeyError:
        # Kept as None (not []) so existing callers that distinguish
        # "no author-group at all" continue to work.
        return None

    aff_info = []
    for group in author_groups:
        if group is None:
            continue
        try:
            value = group['affiliation'][affiliation_key]
            if isinstance(value, list):
                # Only the final element is wanted (the base organization);
                # an empty list has none, so fall back to '' instead of
                # raising IndexError.
                value = value[-1]['$'] if value else ''
            elif isinstance(value, dict) and '$' in value:
                # NOTE(review): presumably a lone organization can arrive as
                # a single {'$': name} mapping, the same shape as the list
                # elements; unwrap it so the output holds the name rather
                # than a dict repr. Confirm against real records.
                value = value['$']
        except KeyError:
            # No affiliation info for these authors: keep the row aligned
            # with an empty string.
            value = ''
        aff_info.append(value)
    return aff_info
|
|
|
|
def make_list(list_or_dict):
    """Return *list_or_dict* unchanged if it is a list, else wrap it in one.

    Lets callers iterate uniformly over a field that holds either a single
    item or a list of items. A list argument is returned as the same object,
    not a copy.
    """
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing it (e.g. to reuse get_entries) has no side effects.
if __name__ == '__main__':
    main()
|