# social-media-chapter/code/data_processing/04_make_paper_subject_table.py

import json
import argparse
import csv

def main():
    """Read an abstracts file and write a paper-to-subject TSV table.

    Command-line arguments:
        -i  Path to the abstracts file. Every non-blank line is handed to
            get_entries, which parses it as one JSON document.
        -o  Path of the tab-separated file to write.

    The output starts with the header row
    ``paper_eid, subject, subject_code`` followed by one row per entry
    returned by get_entries.
    """
    parser = argparse.ArgumentParser(description='Generate paper to subject mapping file from abstracts file')
    # Both paths are mandatory; without them open() would be called with
    # None and fail with an opaque TypeError instead of a usage message.
    parser.add_argument('-i', required=True, help='Abstract file')
    parser.add_argument('-o', required=True, help='TSV output file')
    args = parser.parse_args()

    # newline='' lets the csv writer control row terminators itself;
    # otherwise platforms that translate '\n' emit '\r\r\n' row endings.
    # The encoding is pinned so results do not depend on the locale.
    with open(args.i, 'r', encoding='utf-8') as abstracts, \
            open(args.o, 'w', encoding='utf-8', newline='') as table:
        output = csv.writer(table, delimiter='\t')
        output.writerow(['paper_eid', 'subject', 'subject_code'])
        for line in abstracts:
            # Blank lines (e.g. a trailing newline at end of file) carry
            # no record and would make json.loads fail.
            if not line.strip():
                continue
            output.writerows(get_entries(line))


def get_entries(l):
    """Return the output rows for one abstract record.

    Args:
        l: A string holding one JSON document whose top-level
            'abstracts-retrieval-response' object contains
            'coredata' -> 'eid' and the subject data read by get_subjects.

    Returns:
        A list of ``[eid, subject, subject_code]`` lists, one per subject
        returned by get_subjects (empty when there are no subjects).

    Raises:
        json.JSONDecodeError: If ``l`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    full = json.loads(l)['abstracts-retrieval-response']
    eid = full['coredata']['eid']
    # Prepend the eid to every [subject, subject_code] pair.
    return [[eid, subject, code] for subject, code in get_subjects(full)]


def get_subjects(abstract_response):
    """Return the subject areas listed in an abstract response.

    Args:
        abstract_response: Mapping with a 'subject-areas' entry whose
            'subject-area' value is either a single subject dict or a list
            of them. Each subject dict is read through its '$' and '@code'
            keys.

    Returns:
        A list of ``[subject, subject_code]`` lists, taken from the '$' and
        '@code' values of each subject, in input order.

    Raises:
        KeyError: If 'subject-areas', 'subject-area', '$' or '@code' is
            missing. For the first two, the offending response is written
            to stderr before the error propagates.
    """
    try:
        # Only the lookups live in the try body so that nothing else can
        # be mistaken for a missing-key problem.
        raw_subjects = abstract_response['subject-areas']['subject-area']
    except KeyError:
        # Dump the record that lacks subject data to aid debugging, then
        # let the KeyError propagate. Printing a not-yet-assigned local
        # here would raise UnboundLocalError and hide the real error.
        import sys
        print(abstract_response, file=sys.stderr)
        raise
    # A lone subject arrives as a dict rather than a one-element list;
    # make_list normalizes both shapes.
    return [[s['$'], s['@code']] for s in make_list(raw_subjects)]


def make_list(list_or_dict):
    """Return the argument as a list.

    A list is returned unchanged (the same object, not a copy); any other
    value is wrapped in a new single-element list.
    """
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()