initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
@@ -0,0 +1,177 @@
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
import random
|
||||
|
||||
# Seed the PRNG so the random tie-break in get_country() is reproducible.
random.seed(2017)
|
||||
|
||||
def main():
    """Convert a JSON-lines abstracts file into a tab-separated table.

    Reads the file named by -i (one JSON record per line), flattens each
    record with clean_abstract(), and writes the rows to the -o file.
    """
    arg_parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
    arg_parser.add_argument('-i', help='Abstract file')
    arg_parser.add_argument('-o', help='TSV output file')
    args = arg_parser.parse_args()

    with open(args.i, 'r') as in_file, open(args.o, 'w') as out_file:
        # The first record supplies the column names for the whole file.
        first_record = clean_abstract(json.loads(next(in_file)))
        writer = csv.DictWriter(out_file, first_record.keys(), delimiter='\t')
        writer.writeheader()
        writer.writerow(first_record)
        for raw_line in in_file:
            writer.writerow(clean_abstract(json.loads(raw_line)))
|
||||
|
||||
|
||||
def clean_abstract(json_response):
    """Flatten one abstracts-retrieval JSON record into a flat dict.

    json_response: the parsed JSON (dict) for a single abstract record.
    Returns a dict of scalar attributes suitable for csv.DictWriter.
    Raises KeyError or TypeError when the record lacks expected structure.

    The original wrapped the dict construction in try/except KeyError and
    TypeError handlers that only re-raised (plus a commented-out debug
    print); those were no-ops, so they were removed -- the exceptions
    still propagate unchanged.
    """
    result = json_response['abstracts-retrieval-response']
    head = result['item']['bibrecord']['head']
    attributes = {
        'modal_country': get_country(head),
        'abstract': get_abstract(result),
        'title': get_title(result),
        'source_title': get_source_title(head),
        'language': result['language']['@xml:lang'],
        'first_ASJC_subject_area': get_subject(result, '$'),
        'first_ASJC_classification': get_subject(result, '@code'),
        'first_CPX_class': get_CPX_class(head, 'classification-description'),
        'date': to_date(result['coredata']['prism:coverDate']),
        'aggregation_type': if_exists('prism:aggregationType', result['coredata'], else_val='NA'),
        'eid': result['coredata']['eid'],
        'cited_by_count': result['coredata']['citedby-count'],
        'num_citations': get_citation_count(result),
    }
    return attributes
|
||||
|
||||
def get_citation_count(result):
    """Return the '@refcount' reference count from the bibliography tail.

    Returns None when indexing fails with TypeError (e.g. 'tail' is None).
    A missing key still raises KeyError, as before.
    """
    bibrecord = result['item']['bibrecord']
    try:
        return bibrecord['tail']['bibliography']['@refcount']
    except TypeError:
        # Some records carry None instead of a tail/bibliography structure.
        return None
|
||||
|
||||
def get_title(result):
    """Return the document title from the coredata section.

    Raises KeyError when the title is missing. The original wrapped the
    lookup in `except KeyError: raise`, which is a no-op -- the exception
    propagates identically without it -- so the handler was removed.
    """
    return result['coredata']['dc:title']
|
||||
|
||||
|
||||
def get_source_title(head):
    """Return the source (journal/venue) title for a record.

    Raises KeyError when missing. The original's `except KeyError: raise`
    handler was a no-op and has been removed; behavior is unchanged.
    """
    return head['source']['sourcetitle']
|
||||
|
||||
def get_abstract(result):
    """Return the abstract text with newlines collapsed to spaces.

    Returns None when the record has no 'dc:description' (or no 'coredata').
    """
    try:
        text = result['coredata']['dc:description']
    except KeyError:
        # No abstract available for this record.
        return None
    return text.replace('\n', ' ')
|
||||
|
||||
def get_auth_names(head):
    """Return 'given surname' strings for every author in the record head.

    Bug fix: when 'author-group' was missing, the original printed the
    record and then fell through to use the unbound `auth_info`, raising
    NameError. It now prints the record (as before) and returns [].
    """
    try:
        auth_info = [group['author'] for group in make_list(head['author-group'])]
    except KeyError:
        print(head)
        return []
    auth_names = []
    for auth_group in auth_info:
        for auth in make_list(auth_group):
            auth_names.append('{} {}'.format(
                auth['preferred-name']['ce:given-name'],
                auth['preferred-name']['ce:surname']))
    return auth_names
|
||||
|
||||
def get_country(head):
    """Return the modal affiliation country among a record's authors.

    Ties are broken uniformly at random (random is seeded at module
    level for reproducibility). Returns None when no affiliation info
    is available.

    Fixes: removed the dead `modes = Counter` assignment (it bound the
    Counter class and was immediately overwritten), replaced the
    quadratic per-element `list.count` scan with a single Counter pass,
    and made the empty-input return explicit.
    """
    all_countries = get_aff_info(head, 'country')
    if not all_countries:
        # get_aff_info may return None (no author-group) or an empty list.
        return None
    counts = Counter(all_countries)
    max_count = max(counts.values())
    modes = [country for country, n in counts.items() if n == max_count]
    return random.choice(modes)
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author for the given key.

    Returns a list with one entry per author (an empty string when the
    author group has no affiliation info), or None when the record has
    no 'author-group' at all.
    """
    try:
        author_groups = make_list(head['author-group'])
    except KeyError:
        return None
    aff_info = []
    for group in author_groups:
        try:
            num_auth = len(make_list(group['author']))
        except KeyError:
            # "Collaborations" carry no author/affiliation info; skip them.
            continue
        except TypeError:
            # None sometimes appears in the author list for no reason; skip.
            continue
        try:
            curr_inst = group['affiliation'][affiliation_key]
        except KeyError:
            # No affiliation info for these authors: one '' per author.
            aff_info.extend([''] * num_auth)
        else:
            # One instance of the value for each author at this institution.
            aff_info.extend([curr_inst] * num_auth)
    return aff_info
|
||||
|
||||
def get_keywords(head):
    """Return the author keywords of a record, or None when absent.

    A single keyword arrives as a bare string (returned as a one-element
    list); multiple keywords arrive as dicts whose text lives under '$'.
    """
    cite_info = head['citation-info']
    try:
        keywords = list(make_list(cite_info['author-keywords']['author-keyword']))
        if len(keywords) == 1:
            return keywords
        return [kw['$'] for kw in keywords]
    except KeyError:
        return None
|
||||
|
||||
def get_subject(result, key):
    """Return the requested field of the record's first subject area.

    key is '$' (subject name) or '@code' (ASJC code). Prints the record
    and re-raises when a subject entry lacks the key.
    """
    try:
        values = [area[key] for area in make_list(result['subject-areas']['subject-area'])]
        return values[0]
    except KeyError:
        print(result)
        raise
|
||||
|
||||
def get_CPX_class(head, class_key):
    """Return the requested field of the first CPX classification, or None.

    class_key: e.g. 'classification-description'.
    Returns None when the CPXCLASS group's entries lack class_key (or are
    not subscriptable), and implicitly returns None when no CPXCLASS group
    exists at all.
    """
    try:
        # NOTE(review): if 'classifications' is a single dict rather than a
        # list, this loop iterates its string keys and x['@type'] raises
        # TypeError, which is NOT caught here -- confirm the feed always
        # sends a list at this level.
        for x in head['enhancement']['classificationgroup']['classifications']:
            if x['@type'] == 'CPXCLASS':
                try:
                    return [y[class_key] for y in make_list(x['classification'])][0]
                except (KeyError, TypeError):
                    return None
    except KeyError:
        # NOTE(review): this debug print re-indexes head, so if 'enhancement'
        # or 'classificationgroup' was the missing key, the print itself
        # raises a fresh KeyError before the re-raise.
        print(head['enhancement']['classificationgroup'])
        raise
|
||||
|
||||
def to_date(date_string):
    """Parse a 'YYYY-MM-DD' string into a datetime object."""
    parsed = datetime.strptime(date_string, '%Y-%m-%d')
    return parsed
|
||||
|
||||
|
||||
def if_exists(key, dictionary, else_val=None):
    """Return dictionary[key] when present, otherwise else_val."""
    try:
        value = dictionary[key]
    except KeyError:
        value = else_val
    return value
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a non-list value in a single-element list; pass lists through."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
|
||||
|
||||
# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Write a (to, from, date) citation edgelist TSV from cited-by JSON lines."""
    arg_parser = argparse.ArgumentParser(description='Make a citation network from the cited_by json')
    arg_parser.add_argument('-i', help='Cited_by file')
    arg_parser.add_argument('-o', help='TSV output file')
    args = arg_parser.parse_args()

    with open(args.i, 'r') as in_file, open(args.o, 'w') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['to', 'from', 'date'])
        # One edge per input line: cited paper -> citing paper, with date.
        for raw_line in in_file:
            record = json.loads(raw_line)
            writer.writerow([record['parent_eid'], record['eid'],
                             record['prism:coverDate']])
|
||||
|
||||
|
||||
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
|
||||
29
code/data_processing/02_filter_edgelist.py
Normal file
29
code/data_processing/02_filter_edgelist.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Filter an edgelist TSV down to edges whose source is in the search set.

    First pass collects the set of 'to' papers (column 0); second pass
    keeps only rows whose 'from' paper (column 1) is in that set.

    Bug fix: after `in_file.seek(0)` the original never re-skipped the
    header, so the header row was fed through the filter as if it were
    data. A second `next(reader)` now discards it.
    """
    parser = argparse.ArgumentParser(description='Take the edgelist, and reduce it to just the papers which are in our search')
    parser.add_argument('-i', help='Full edgelist file')
    parser.add_argument('-o', help='Edgelist output file')
    args = parser.parse_args()

    with open(args.i, 'r') as in_file:
        reader = csv.reader(in_file, delimiter='\t')
        next(reader)  # Discard header
        # First pass: the set of nodes ('to' papers) to keep.
        nodes = {row[0] for row in reader}
        in_file.seek(0)   # Start over at the beginning
        next(reader)      # Discard the header again on the second pass
        with open(args.o, 'w') as out_file:
            writer = csv.writer(out_file, delimiter='\t')
            writer.writerow(['to', 'from', 'date'])
            for row in reader:
                # Keep the edge only when its source paper was retrieved.
                if row[1] in nodes:
                    writer.writerow(row)
|
||||
|
||||
|
||||
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
|
||||
62
code/data_processing/03_make_paper_aff_table.py
Normal file
62
code/data_processing/03_make_paper_aff_table.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Write a paper->affiliation table (TSV) from an abstracts JSON-lines file."""
    arg_parser = argparse.ArgumentParser(description='Generate paper to affiliation mapping file from abstracts file')
    arg_parser.add_argument('-i', help='Abstract file')
    arg_parser.add_argument('-o', help='TSV output file')
    args = arg_parser.parse_args()

    with open(args.i, 'r') as in_file, open(args.o, 'w') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['paper_eid', 'affiliation_id',
                         'organization', 'country'])
        # Each input line may yield several (paper, affiliation) rows.
        for raw_line in in_file:
            for entry in get_entries(raw_line):
                writer.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Turn one raw abstract JSON line into paper->affiliation rows.

    l: one JSON string from the abstracts file.
    Returns a list of [eid, affiliation_id, organization, country] rows,
    or [] when the record has no country info.
    """
    record = json.loads(l)
    full = record['abstracts-retrieval-response']
    head = full['item']['bibrecord']['head']
    eid = full['coredata']['eid']
    countries = get_aff_info(head, 'country')
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')
    if not countries:
        return []
    # One row per author entry; the three lists are index-aligned.
    return [[eid, affiliation_ids[idx], org_names[idx], countries[idx]]
            for idx in range(len(countries))]
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author group for the given key.

    Returns None when the record has no 'author-group'; otherwise a list
    with '' for groups lacking affiliation info. When the raw value is a
    list, the final element (the base organization) is used.
    """
    try:
        author_groups = make_list(head['author-group'])
    except KeyError:
        return None
    aff_info = []
    for group in author_groups:
        if group is None:
            # None shows up in some author-group lists; skip it.
            continue
        try:
            value = group['affiliation'][affiliation_key]
            if isinstance(value, list):
                # A list of {'$': name} dicts; keep the last entry,
                # which is the base organization.
                value = [item['$'] for item in value][-1]
            aff_info.append(value)
        except KeyError:
            # No affiliation info for these authors: record an empty string.
            aff_info.append('')
    return aff_info
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a non-list value in a single-element list; pass lists through."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
|
||||
|
||||
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
50
code/data_processing/04_make_paper_subject_table.py
Normal file
50
code/data_processing/04_make_paper_subject_table.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Write a paper->subject table (TSV) from an abstracts JSON-lines file."""
    arg_parser = argparse.ArgumentParser(description='Generate paper to subject mapping file from abstracts file')
    arg_parser.add_argument('-i', help='Abstract file')
    arg_parser.add_argument('-o', help='TSV output file')
    args = arg_parser.parse_args()

    with open(args.i, 'r') as in_file, open(args.o, 'w') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['paper_eid', 'subject',
                         'subject_code'])
        # Each input line may yield several (paper, subject) rows.
        for raw_line in in_file:
            for entry in get_entries(raw_line):
                writer.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Turn one raw abstract JSON line into [eid, subject, subject_code] rows.

    l: one JSON string from the abstracts file.
    The unreachable `return []` that followed the return statement in the
    original (dead code) was removed; behavior is unchanged.
    """
    json_response = json.loads(l)
    full = json_response['abstracts-retrieval-response']
    eid = full['coredata']['eid']
    subjects = get_subjects(full)
    # Prepend the eid to each (name, code) pair.
    return [[eid, s[0], s[1]] for s in subjects]
|
||||
|
||||
|
||||
def get_subjects(abstract_response):
    """Return [name, code] pairs for every subject area of a record.

    Prints the record and re-raises when 'subject-areas' is missing.

    Bug fix: the original did `print(result)` inside the except block,
    but `result` is not assigned until after the try block -- that raised
    NameError and masked the KeyError. It now prints the record that was
    actually being processed.
    """
    try:
        subject_info = make_list(abstract_response['subject-areas']['subject-area'])
    except KeyError:
        print(abstract_response)
        raise
    result = []
    for s in subject_info:
        # Keep the subject name ('$') and its ASJC code ('@code').
        result.append([s['$'], s['@code']])
    return result
|
||||
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a non-list value in a single-element list; pass lists through."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]
|
||||
|
||||
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
17
code/data_processing/05_save_descriptives.R
Normal file
17
code/data_processing/05_save_descriptives.R
Normal file
@@ -0,0 +1,17 @@
|
||||
# Load the abstracts TSV produced by 00_abstracts_to_tsv.py.
df = read.csv('processed_data/abstracts.tsv',sep='\t', strip.white=TRUE)
df['date'] = as.Date(df$date)
# Empty country strings mean "unknown"; recode them as NA.
df$modal_country[df['modal_country'] == ''] <- NA
df['year'] = format(df['date'],'%Y')

# Keep only papers that actually have abstract text.
abstracts <- df[df['abstract'] != '',c('eid','abstract')]
# Creates a vector of word counts, based on counting all of the groups of alphanumeric characters
word_count <- apply(abstracts, 1, function(x) sapply(gregexpr("[[:alnum:]]+", x['abstract']), function(x) sum(x > 0)))

# Join per-paper subject labels (from 04_make_paper_subject_table.py).
s = read.csv('processed_data/paper_subject_table.tsv', sep='\t')
full <- merge(df,s, by.x = 'eid', by.y = 'paper_eid')

# zero these out before we save them so we don't save all of the abstracts.
full['abstract'] <- NULL
df['abstract'] <- NULL

# Persist the derived data sets for the paper's analysis scripts.
save(df, abstracts, s, full, word_count, file="paper/data/orig_data_sets.RData")
|
||||
26
code/data_processing/make_network.py
Normal file
26
code/data_processing/make_network.py
Normal file
@@ -0,0 +1,26 @@
|
||||
'''Takes a CSV of retrieved articles, and creates an igraph
network from them (not even close to done)'''

# NOTE(review): `igraph` is never imported in this file, so the class
# statement below fails at import time. The module docstring says it is
# "not even close to done" -- confirm it is unused before relying on it.
class CitationNetwork(igraph.Graph):
    # Directed citation graph built in two phases: edges/vertices are
    # buffered in temp_* lists, then materialized by make_network().

    def __init__(self, network_type):
        # network_type: caller-supplied label; only stored, never read here.
        super().__init__(directed=True)
        self.temp_edges = []
        self.temp_vertices = []
        self.network_type = network_type

    # NOTE(review): this overrides igraph.Graph.add_vertices with a
    # different signature, yet make_network() below calls
    # self.add_vertices(nodes) with a single argument -- one of the two
    # must be wrong.
    def add_vertices(self, to_node, from_nodes):
        # Buffer one [from_node, to_node] pair per source node.
        self.temp_vertices += [[from_node, to_node] for from_node in from_nodes]

    def make_network(self):
        # Get the unique set of nodes, and add them.
        # NOTE(review): temp_vertices holds lists (unhashable), so set(...)
        # would raise TypeError, and v['eid'] indexes a list with a string;
        # this method cannot run as written.
        nodes = set([v for v in self.temp_vertices if v['eid'] not in self.vs['name']])
        nodes = sorted(nodes)
        self.add_vertices(nodes)
        self.add_edges(self.temp_edges)
        self.es['weight'] = 1

    def collapse_weights(self):
        # Merge parallel edges, summing their weights into one edge.
        self.simplify(combine_edges={"weight": "sum"})

    # NOTE(review): indentation was lost in this view; whether this was a
    # method or a module-level function, it is broken either way -- it has
    # no `self` parameter yet references `self.retrieved_eids`, which is
    # never initialized anywhere, and `citations` is unused.
    def add_citations(eid, citations):
        self.retrieved_eids.append(eid)
|
||||
Reference in New Issue
Block a user