
initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,232 @@
# coding: utf-8
# # Import data and get things set up
import random
random.seed(9001)
# import the rpy2 modules we use to write R data files, and create the variable we'll write to
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = {}
def remember(name, x):
r[name] = x
# load in modules we'll need for analysis
import subprocess
import csv
from igraph import *
import pandas as pd
import numpy as np
import re
# grab the largest connected component with a little function
def get_largest_component(g):
g_components = g.components(mode="WEAK")
max_size = max(g_components.sizes())
for g_tmp in g_components.subgraphs():
if g_tmp.vcount() == max_size:
return(g_tmp)
# load the full edgelist into igraph
def edge_list_iter(df):
for i, row in df.iterrows():
yield (row['from'], row['to'])
# list top 5 journals for each of the clusters
def top_journals_for_clusters(clu):
articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])
output = pd.DataFrame()
for cid in articles_tmp['cluster'].unique():
journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)
tmp = pd.DataFrame({'cluster' : cid, 'count' : journal_counts })
output = output.append(tmp)
output = output.reset_index()
output = output.rename(columns = {'index' : "journal"})
return(output)
def infomap_edgelist(g, edgelist_filename, directed=True):
nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,
'eid' : v['name']} for v in g.vs ])
# write out the edgelist to an external file so we can call infomap on it
with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
for e in g.es:
if e.source != e.target:
if 'weight' in e.attributes():
print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
else:
print("{}\t{}".format(e.source, e.target), file=f)
# run the external program to generate the infomap clustering
infomap_cmdline = ["code/bibliometrics/infomap/Infomap", "code/bibliometrics/" + edgelist_filename + ".txt", "code/bibliometrics/output_dir", "-z", "--map", "--clu", "--tree"]
if directed:
infomap_cmdline.append("-d")
subprocess.call(infomap_cmdline)
# load up the clu data
clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
header=None, comment="#", delim_whitespace=True)
clu.columns = ['node_infomap', 'cluster', 'flow']
return pd.merge(clu, nodes_tmp, on="node_infomap")
def write_graphml(g, clu, graphml_filename):
clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
g.vs["cluster"] = clu["cluster"].tolist()
g.write_graphml("code/bibliometrics/" + graphml_filename)
# load article data
articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")
# # network for just the central "social media" set
# this contains the list of all INCOMING citations for each paper in the original set
raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")
g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)
g_sm = get_largest_component(g_sm_all)
g_sm = g_sm.simplify()
g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)
g_sm_clu['cluster'].value_counts()
write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
# # larger network that contains the incoming cites to citing articles
# this contains the list of all INCOMING citations to everything in the original set
# plus every INCOMING citation to every paper that cites one of those papers
raw_edgelist_files = ["processed_data/citation_edgelist.txt",
"processed_data/social_media_edgelist.txt"]
combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])
g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)
g_full = get_largest_component(g_full_all)
g_full = g_full.simplify()
g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)
g_full_clu['cluster'].value_counts()
top_journals_for_clusters(g_full_clu)
write_graphml(g_full, g_full_clu, "g_full.graphml")
# # create the meta-network of connections between clusters
edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]], how="inner", left_on="to", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
edgelist_tmp.drop('eid', 1, inplace=True)
edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]], how="inner", left_on="from", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
edgelist_tmp.drop('eid', 1, inplace=True)
edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]
cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
cluster_edgelist["to_cluster"] = cluster_edgelist.index
cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]
remember("cluster_edgelist", cluster_edgelist)
top_clusters = g_sm_clu["cluster"].value_counts().head(6).index
# write the edgelist for just the top clusters (currently the top 6)
cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
(cluster_edgelist["from_cluster"].isin(top_clusters))]
cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]
g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values], directed=True)
g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()
# assign the number of total articles as an attribute for each node
g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()
g_cluster.write_graphml("code/bibliometrics/clusters.graphml")
# # create network stats for tables (overall and within clusters)
def create_network_stats(g):
network_stats = pd.DataFrame({'eid' : g.vs['name'],
'eig_cent' : g.eigenvector_centrality(),
'indegree' : g.indegree(),
'betweenness' : g.betweenness()})
network_stats = pd.merge(network_stats,
articles[['eid', 'title', 'source_title']],
how="inner")
return network_stats
network_stats = create_network_stats(g_full)
network_stats.sort_values("indegree", ascending=False).head(4)
network_stats.sort_values("eig_cent", ascending=False).head(4)
network_stats.sort_values("betweenness", ascending=False).head(4)
# # things to store
remember('total_articles', articles.shape[0])
# total number of citations in the sm dataset
remember('sm_citations', raw_edgelist.shape[0])
remember('sm_citing', len(raw_edgelist["from"].unique()))
# the number of articles in the original dataset that have any INCOMING citations
remember('sm_cited', len(raw_edgelist["to"].unique()))
# total number of citations in the full (combined) dataset
remember('all_citations', combo_raw_edgelist.shape[0])
remember('all_citing', len(combo_raw_edgelist["from"].unique()))
# the number of articles in the combined dataset that have any INCOMING citations
remember('all_cited', len(combo_raw_edgelist["to"].unique()))
remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])
sorted(r.keys())
# save the remembered values to an RData file
def save_to_r(r_dict, filename="output.RData"):
for var_name, x in r_dict.items():
var_name = var_name.replace('_', '.')
if type(x) == np.int64:
x = np.asscalar(x)
if type(x) == pd.DataFrame:
rx = pandas2ri.py2ri(x)
else:
rx = x
robjects.r.assign(var_name, rx)
# collect everything assigned above into a single R list named 'r'
robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
robjects.r('save("r", file="{}")'.format(filename))
robjects.r("rm(list=ls())")
save_to_r(r, "paper/data/network_data.RData")

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,24 @@
import argparse
from request_functions import *
'''
This script takes in a search query and an output file. It queries the Scopus API to find all papers that match the search query and saves them to the output file.
Unlike some of the other scripts in this directory, it does not try to determine its state - if you restart the script, it will start over and blow away whatever you had saved before.
'''
years = range(2004, 2017)
def main():
parser = argparse.ArgumentParser(description='Output JSON of all articles matching search query')
parser.add_argument('-q', help='Search query', required=True)
parser.add_argument('-o', help='Where to write JSON results')
args = parser.parse_args()
with open(args.o, 'w') as out_file:
for year in years:
get_search_results(args.q, out_file, year=year)
if __name__ == '__main__':
main()
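# Example invocation (hypothetical script and file names; only the -q and -o flags
# defined above are taken from the code):
#   python scopus_search.py -q 'TITLE-ABS-KEY("social media")' -o raw_data/search_results.json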

View File

@@ -0,0 +1,56 @@
from request_functions import *
import argparse
import json
import subprocess
def main():
parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
parser.add_argument('-i', help='JSON file which includes eids')
parser.add_argument('--eid', '-e', help='Single eid')
parser.add_argument('-o', help='Where to append JSON results')
args = parser.parse_args()
if args.eid:
eids = [args.eid]
elif args.i:
with open(args.i, 'r') as f:
eids = [json.loads(line)['eid'] for line in f]
else:
print('Need to either pass in an eid or a json file with eids')
return
# If the script gets interrupted, we need to start where we left off
try:
errors = []
with open(args.o, 'r') as f:
completed_eids = []
for line in f:
try:
result = json.loads(line)
completed_eids.append(result['abstracts-retrieval-response']['coredata']['eid'])
except ValueError:
errors.append(line)
except IOError as e:
completed_eids = []
print('{} completed eids'.format(len(completed_eids)))
with open(args.o, 'a') as out_file:
for eid in eids:
if eid not in completed_eids:
result = get_abstract(eid)
if result:
out_file.write(result)
out_file.write('\n')
else:
errors.append(eid)
if len(errors) > 0:
with open('raw_data/missing_eids.json', 'a') as l:
# Write out the bad lines from the output file and any eids we failed to retrieve
for e in errors: l.write(e)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,43 @@
from request_functions import *
import argparse
import json
import subprocess
from os import remove
def main():
parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
parser.add_argument('-o', help='Where to append JSON results')
args = parser.parse_args()
with open(args.i, 'r') as f:
# Make a dictionary of eid:citation count for each line in the file
eids = {}
for line in f:
l = json.loads(line)
eids[l['eid']] = l['citedby-count']
# If the script gets interrupted, we need to start where we left off
try:
# Open the output file, and grab all of the eids which are already completed
with open(args.o, 'r') as f:
completed_eids = [json.loads(l)['parent_eid'] for l in f]
# Remove those which came from the last id (since we may have missed some)
if len(completed_eids) > 0:
last_eid = completed_eids.pop()
# Remove all of the lines which came from the last eid
subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
# Hopefully everything has worked out, because here we blow away the backup
remove('{}.bak'.format(args.o))
except IOError:
# If the file doesn't exist, then there aren't any completed eids
completed_eids = []
with open(args.o, 'a') as out_file:
for eid, citation_count in eids.items():
if citation_count != '0' and eid not in completed_eids:
get_cited_by(eid, out_file)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,166 @@
import requests
from datetime import datetime
from scopus_api import key as API_KEY
import json
import os
import logging
import re
logging.basicConfig(level=logging.DEBUG)
RETRY_COUNT = 5
TIMEOUT_SECS = 10
# Initialize a global session object
s = requests.Session()
s.headers.update({'X-ELS-APIKey' : API_KEY,
'X-ELS-ResourceVersion' : 'XOCS',
'Accept' : 'application/json'})
def get_token(location_id = None):
'''Given a location_id, gets an authentication token'''
print('Getting a token')
api_resource = 'http://api.elsevier.com/authenticate'
# Parameters
payload = {'platform':'SCOPUS',
'choice': location_id}
r = s.get(api_resource, params = payload)
r.raise_for_status()
s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
def get_search_results(query, output_file, results_per_call = 200,
tot_results=None, year=None, sort='+title', citation_call=False):
'''Handles getting search results. Takes a query and an output
file. Writes as many of the search results as possible to the
output file as JSON dictionaries, one per line.'''
result_set = []
results_added = 0
def curr_call(start=0, count=results_per_call):
'''Shorthand for the current call: DRY'''
return make_search_call(query, start=start,
count=count, year=year, sort=sort)
if tot_results is None:
# Call the API initially to figure out how many results there are, and write the results
initial_results = curr_call(count=results_per_call)
tot_results = int(initial_results['search-results']['opensearch:totalResults'])
result_set.append((initial_results, sort))
results_added += results_per_call
logging.debug("Total results: {}".format(tot_results))
if tot_results == 0:
return None
if tot_results > 5000:
# If this is just one year, we can't get any more granular, and
# we need to return what we can.
if tot_results > 10000:
print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
first_half = last_half = 5000
else:
# Get half, and correct for odd # of results
first_half = tot_results//2 + tot_results % 2
last_half = tot_results//2
# Break the search into the first half and the bottom half of results.
get_search_results(query, output_file,
year = year,
tot_results=first_half)
# Get the other half
get_search_results(query, output_file,
year = year,
tot_results = last_half, sort='-title')
# If there are 5000 or fewer to retrieve, then get them
else:
logging.debug('Retrieving {} results'.format(tot_results))
# As long as there are more citations to retrieve, then do it, and write
# them to the file
while results_added < tot_results:
# If we are near the end, then only get as many results as are left.
to_retrieve = min(results_per_call, (tot_results - results_added))
curr_results = curr_call(start=results_added, count=to_retrieve)
result_set.append((curr_results, sort))
results_added += results_per_call
# This is hacky, but I'm doing it
# If this is a citation call, then construct metadata to be written with the result
if citation_call:
metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
else:
metadata = {}
write_results(result_set, output_file, metadata)
def write_results(result_set, output_file, metadata={}):
for x in result_set:
search_json = x[0]
to_reverse = x[1].startswith('-')
try:
results = [x for x in search_json['search-results']['entry']]
except KeyError:
raise
if to_reverse:
results = results[::-1]
for x in results:
for k, v in metadata.items():
x[k] = v
json.dump(x, output_file)
output_file.write('\n')
def make_search_call(query, start=0, count=200,
sort='+title', year=None,
retry_limit = RETRY_COUNT,
timeout_secs = TIMEOUT_SECS):
api_resource = "https://api.elsevier.com/content/search/scopus"
# Parameters
payload = {'query':query,
'count':count,
'start':start,
'sort': sort,
'date': year}
for _ in range(retry_limit):
try:
r = s.get(api_resource,
params = payload,
timeout = timeout_secs)
logging.debug(r.url)
if r.status_code == 401:
get_token()
continue
if r.status_code == 400:
raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus access?')
break
except requests.exceptions.Timeout:
pass
else:
raise requests.exceptions.Timeout('Timeout Error')
r.raise_for_status()
return r.json()
def get_cited_by(eid, output_file):
return get_search_results('refeid({})'.format(eid), output_file, results_per_call=200,
citation_call = True)
def get_abstract(eid, retry_limit = RETRY_COUNT,
timeout_secs = TIMEOUT_SECS):
api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
# Parameters
payload = {}
for _ in range(retry_limit):
try:
r = s.get(api_resource,
params = payload,
timeout = timeout_secs)
if r.status_code == 401:
get_token()
continue
if r.status_code == 400:
raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus access?')
break
except requests.exceptions.Timeout:
pass
else:
raise requests.exceptions.Timeout('Timeout Error')
if r.status_code == 404:
return None
r.raise_for_status()
return r.content.decode('utf-8')

View File

@@ -0,0 +1 @@
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

View File

@@ -0,0 +1,177 @@
from collections import Counter
from datetime import datetime
import json
import argparse
import csv
import random
random.seed(2017)
def main():
parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
parser.add_argument('-i', help='Abstract file')
parser.add_argument('-o', help='TSV output file')
args = parser.parse_args()
with open(args.i, 'r') as i:
with open(args.o, 'w') as o:
# Have to get the field names
first_line = clean_abstract(json.loads(next(i)))
fieldnames = first_line.keys()
output = csv.DictWriter(o, fieldnames, delimiter='\t')
output.writeheader()
output.writerow(first_line)
for line in i:
output.writerow(clean_abstract(json.loads(line)))
def clean_abstract(json_response):
result = json_response['abstracts-retrieval-response']
head = result['item']['bibrecord']['head']
try:
attributes = {
'modal_country': get_country(head),
'abstract' : get_abstract(result),
'title' : get_title(result),
'source_title': get_source_title(head),
'language': result['language']['@xml:lang'],
'first_ASJC_subject_area': get_subject(result, '$'),
'first_ASJC_classification': get_subject(result, '@code'),
'first_CPX_class': get_CPX_class(head, 'classification-description'),
'date': to_date(result['coredata']['prism:coverDate']),
'aggregation_type' : if_exists('prism:aggregationType',result['coredata'],else_val='NA'),
'eid' : result['coredata']['eid'],
'cited_by_count': result['coredata']['citedby-count'],
'num_citations': get_citation_count(result)
}
except KeyError:
raise
except TypeError:
# print(result)
raise
return attributes
def get_citation_count(result):
try:
return result['item']['bibrecord']['tail']['bibliography']['@refcount']
except TypeError:
return None
def get_title(result):
try:
return result['coredata']['dc:title']
except KeyError:
raise
def get_source_title(head):
try:
return head['source']['sourcetitle']
except KeyError:
raise
def get_abstract(result):
try:
abstract = result['coredata']['dc:description']
abstract = abstract.replace('\n',' ')
return abstract
except KeyError:
return None
def get_auth_names(head):
try:
auth_info = [x['author'] for x in make_list(head['author-group'])]
except KeyError:
print(head)
raise
auth_names = []
for auth_group in auth_info:
for auth in make_list(auth_group):
auth_names.append('{} {}'.format(
auth['preferred-name']['ce:given-name'],
auth['preferred-name']['ce:surname']))
return auth_names
def get_country(head):
all_countries = get_aff_info(head, 'country')
if all_countries:
# Find the mode. If there's more than one, choose randomly
s = set(all_countries)
max_count = max([all_countries.count(x) for x in s])
modes = [x for x in s if all_countries.count(x) == max_count]
return random.choice(modes)
def get_aff_info(head, affiliation_key):
aff_info = []
try:
authors = make_list(head['author-group'])
except KeyError:
return None
for x in authors:
try:
num_auth = len(make_list(x['author']))
except KeyError:
# Apparently there are things called "collaborations", which don't have affiliation info.
# I'm just skipping them
continue
except TypeError:
# And apparently "None" appears in the author list for no reason. :)
continue
try:
curr_inst = x['affiliation'][affiliation_key]
# Add one instance for each author from this institution
aff_info += [curr_inst] * num_auth
except KeyError:
# If there isn't affiliation info for these authors, add an empty string for each author
aff_info += [''] * num_auth
return aff_info
def get_keywords(head):
cite_info = head['citation-info']
try:
keywords = [x for x in
make_list(cite_info['author-keywords']['author-keyword'])]
# When there's only one keyword, it's a string. Otherwise, we will
# have a list of dictionaries
if len(keywords) == 1:
return keywords
else:
return [x['$'] for x in keywords]
except KeyError:
return None
def get_subject(result, key):
try:
return [x[key] for x in make_list(result['subject-areas']['subject-area'])][0]
except KeyError:
print(result)
raise
def get_CPX_class(head, class_key):
try:
for x in head['enhancement']['classificationgroup']['classifications']:
if x['@type'] == 'CPXCLASS':
try:
return [y[class_key] for y in make_list(x['classification'])][0]
except (KeyError, TypeError):
return None
except KeyError:
print(head['enhancement']['classificationgroup'])
raise
def to_date(date_string):
return datetime.strptime(date_string, '%Y-%m-%d')
def if_exists(key, dictionary, else_val = None):
try:
return dictionary[key]
except KeyError:
return else_val
def make_list(list_or_dict):
return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]
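# For example, make_list({'$': 'keyword'}) returns [{'$': 'keyword'}] while
# make_list([1, 2]) returns [1, 2]. As noted in the comments above, Scopus returns a
# single item when a field has one entry and a list when it has several, so this
# helper normalizes both cases.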
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,25 @@
from datetime import datetime
import json
import argparse
import csv
def main():
parser = argparse.ArgumentParser(description='Make a citation network from the cited_by json')
parser.add_argument('-i', help='Cited_by file')
parser.add_argument('-o', help='TSV output file')
args = parser.parse_args()
with open(args.i, 'r') as i:
with open(args.o, 'w') as o:
output = csv.writer(o, delimiter = '\t')
output.writerow(['to','from', 'date'])
for line in i:
line = json.loads(line)
output.writerow([line['parent_eid'], line['eid'], line['prism:coverDate']])
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,29 @@
import argparse
import csv
def main():
parser = argparse.ArgumentParser(description='Take the edgelist, and reduce it to just the papers which are in our search')
parser.add_argument('-i', help='Full edgelist file')
parser.add_argument('-o', help='Edgelist output file')
args = parser.parse_args()
with open(args.i, 'r') as in_file:
i = csv.reader(in_file, delimiter= '\t')
next(i) # Discard header
# Get the list of nodes to keep
nodes = set([x[0] for x in i])
in_file.seek(0) # Start over at the beginning
with open(args.o, 'w') as o:
output = csv.writer(o, delimiter = '\t')
output.writerow(['to','from', 'date'])
for line in i:
# Keep the line if the citing paper is itself one of the papers in our search
if line[1] in nodes:
output.writerow(line)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,62 @@
import json
import argparse
import csv
def main():
parser = argparse.ArgumentParser(description='Generate paper to affiliation mapping file from abstracts file')
parser.add_argument('-i', help='Abstract file')
parser.add_argument('-o', help='TSV output file')
args = parser.parse_args()
with open(args.i, 'r') as i:
with open(args.o, 'w') as o:
output = csv.writer(o, delimiter='\t')
output.writerow(['paper_eid','affiliation_id',
'organization','country'])
for line in i:
entries = get_entries(line)
for entry in entries:
output.writerow(entry)
def get_entries(l):
json_response = json.loads(l)
full = json_response['abstracts-retrieval-response']
head = full['item']['bibrecord']['head']
eid = full['coredata']['eid']
countries = get_aff_info(head, 'country')
affiliation_ids = get_aff_info(head, '@afid')
org_names = get_aff_info(head, 'organization')
if countries:
result = [[eid, affiliation_ids[i], org_names[i], countries[i]]
for i in range(len(countries))]
return result
return []
def get_aff_info(head, affiliation_key):
aff_info = []
try:
affiliations = make_list(head['author-group'])
except KeyError:
return None
for x in affiliations:
if x is None:
continue
try:
curr_inst = x['affiliation'][affiliation_key]
# May return a string or a list. If it's a list, then
# return the final value of that list (This is the base organization)
if isinstance(curr_inst, list):
curr_inst = [x['$'] for x in curr_inst][-1]
aff_info.append(curr_inst)
except KeyError:
# If there isn't affiliation info for this author group, add an empty string
aff_info.append('')
return aff_info
def make_list(list_or_dict):
return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,50 @@
import json
import argparse
import csv
def main():
parser = argparse.ArgumentParser(description='Generate paper to subject mapping file from abstracts file')
parser.add_argument('-i', help='Abstract file')
parser.add_argument('-o', help='TSV output file')
args = parser.parse_args()
with open(args.i, 'r') as i:
with open(args.o, 'w') as o:
output = csv.writer(o, delimiter='\t')
output.writerow(['paper_eid','subject',
'subject_code'])
for line in i:
entries = get_entries(line)
for entry in entries:
output.writerow(entry)
def get_entries(l):
json_response = json.loads(l)
full = json_response['abstracts-retrieval-response']
eid = full['coredata']['eid']
subjects = get_subjects(full)
# Prepend the eid, and return the subjects
return [[eid,s[0],s[1]] for s in subjects]
def get_subjects(abstract_response):
try:
subject_info = make_list(abstract_response['subject-areas']['subject-area'])
except KeyError:
print(abstract_response)
raise
result = []
for s in subject_info:
# Get the subject name and code, and append them
result.append([s['$'],s['@code']])
return result
def make_list(list_or_dict):
return list_or_dict if isinstance(list_or_dict, list) else [list_or_dict]
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,17 @@
df = read.csv('processed_data/abstracts.tsv',sep='\t', strip.white=TRUE)
df['date'] = as.Date(df$date)
df$modal_country[df['modal_country'] == ''] <- NA
df['year'] = format(df['date'],'%Y')
abstracts <- df[df['abstract'] != '',c('eid','abstract')]
# Creates a vector of word counts, based on counting all of the groups of alphanumeric characters
word_count <- apply(abstracts, 1, function(x) sapply(gregexpr("[[:alnum:]]+", x['abstract']), function(x) sum(x > 0)))
s = read.csv('processed_data/paper_subject_table.tsv', sep='\t')
full <- merge(df,s, by.x = 'eid', by.y = 'paper_eid')
# drop the abstract text before we save so we don't store all of the abstracts.
full['abstract'] <- NULL
df['abstract'] <- NULL
save(df, abstracts, s, full, word_count, file="paper/data/orig_data_sets.RData")

View File

@@ -0,0 +1,26 @@
'''Takes a CSV of retrieved articles, and creates an igraph
network from them (not even close to done)'''
import igraph
class CitationNetwork(igraph.Graph):
def __init__(self, network_type):
super().__init__(directed=True)
self.temp_edges = []
self.temp_vertices = []
self.network_type = network_type
def add_vertices(self, to_node, from_nodes):
self.temp_vertices += [[from_node, to_node] for from_node in from_nodes]
def make_network(self):
# Get the unique set of nodes, and add them.
nodes = set([v for v in self.temp_vertices if v['eid'] not in self.vs['name']])
nodes = sorted(nodes)
self.add_vertices(nodes)
self.add_edges(self.temp_edges)
self.es['weight'] = 1
def collapse_weights(self):
self.simplify(combine_edges={"weight": "sum"})
def add_citations(eid, citations):
self.retrieved_eids.append(eid)

View File

@@ -0,0 +1,89 @@
from time import time
from sklearn.feature_extraction.text import CountVectorizer
import csv
import argparse
n_features = 100000 # Gets the top n_features terms
n_samples = None # Enter an integer here for testing, so it doesn't take so long
def main():
parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
parser.add_argument('-i', help='Location of the abstracts file',
default='processed_data/abstracts.tsv')
parser.add_argument('-o', help='Location of the output file',
default='processed_data/ngram_table.csv')
parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
default=3)
args = parser.parse_args()
print("Loading dataset...")
t0 = time()
doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
print("done in %0.3fs." % (time() - t0))
# Write the header
write_header(args.o)
bags_o_words = get_counts(data_samples, n_features, args.n)
write_output(doc_ids, bags_o_words, args.o)
def get_counts(abstracts, n_features, ngram_max):
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
max_features=n_features,
stop_words='english',
ngram_range = (1,ngram_max))
t0 = time()
tf = tf_vectorizer.fit_transform(abstracts)
print("done in %0.3fs." % (time() - t0))
terms = tf_vectorizer.get_feature_names()
freqs = tf.toarray()
bags_o_words = to_bags_o_words(terms, freqs)
return bags_o_words
def write_header(out_file):
with open(out_file, 'w') as o_f:
out = csv.writer(o_f)
out.writerow(['document_id','term','frequency'])
def to_bags_o_words(terms, freqs):
'''Takes in the vectorizer stuff, and returns a list of dictionaries, one for each document.
The format of the dictionaries is term:count within that document.
'''
result = []
for d in freqs:
curr_result = {terms[i]:val for i,val in enumerate(d) if val > 0 }
result.append(curr_result)
return result
def write_output(ids, bags_o_words, out_file):
with open(out_file, 'a') as o_f:
out = csv.writer(o_f)
for i, doc in enumerate(bags_o_words):
for k,v in doc.items():
# For each term and count, output a row, together with the document id
out.writerow([ids[i],k,v])
def get_ids_and_abstracts(fn, length_limit):
with open(fn, 'r') as f:
in_csv = csv.DictReader(f, delimiter='\t')
abstracts = []
ids = []
i = 1
for r in in_csv:
try:
abstracts.append(r['abstract'])
ids.append(r['eid'])
except KeyError:
print(r)
if length_limit and i > length_limit:
break
i += 1
return ids, abstracts
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,89 @@
source("code/prediction/utils.R")
# use this to store things for use in the paper
pred.descrip <- NULL
abstracts <- read.delim("processed_data/abstracts.tsv", header=TRUE,
stringsAsFactors=FALSE, sep="\t")
abstracts <- subset(abstracts, select = -abstract)
abstracts <- abstracts[abstracts$aggregation_type != "Trade Journal" &
is.na(abstracts$aggregation_type) == FALSE, ]
names(abstracts)[names(abstracts) == 'num_citations'] <- 'works_cited'
abstracts$works_cited[is.na(abstracts$works_cited) == TRUE] <- 0
# affiliations
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
header=TRUE, stringsAsFactors=FALSE,
sep="\t")
# eliminate missing values
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
affiliations$organization != "", ]
remap.affiliations <- function(aff.id,
aff.df = affiliations){
org.modal <- names(tail(sort(table(affiliations$organization[
affiliations$affiliation_id == aff.id])),1))
return(org.modal)
}
affiliations$organization <- sapply(affiliations$affiliation_id, remap.affiliations)
affiliations <- subset(affiliations, select = c(paper_eid,
organization))
names(affiliations) <- c("eid", "affiliation")
# need to remove repeat affiliations
affiliations <- affiliations[duplicated(affiliations$eid) == FALSE,]
######################################
d <- abstracts[, c("eid", "language", "modal_country",
"source_title", "works_cited")]
# dichotomous dependent variable
d$cited <- abstracts$cited_by_count > 0
# store this here for use in the paper before we run any restrictions:
pred.descrip$cited <- d$cited
pred.descrip$cites <- abstracts$cited_by_count
# We want these to be categorical variables
d$modal_country <- factor(d$modal_country)
d$language <- factor(d$language)
d$subject <- factor(abstracts$first_ASJC_subject_area)
d$source_title <- factor(d$source_title)
d$month <- factor(strftime(abstracts$date, format= "%m"))
# except for pub year - keep that continuous
d$year <- as.numeric(strftime(abstracts$date, format="%Y"))
# bring in org affiliations
d <- merge(d, affiliations, by="eid") # note that this drops papers
# w/out org info
d$affiliation <- factor(d$affiliation)
##### Restrictions:
### do this explicitly so that changes are easy:
d <- restrict(d, d$affiliation, 1)
d <- restrict(d, d$subject, 1)
d <- restrict(d, d$source_title, 1)
d <- restrict(d, d$language, 1)
d <- restrict(d, d$modal_country, 1)
# n.authors
# per author prior citations
pred.descrip$covars <- d
save(pred.descrip, file = "paper/data/prediction_descriptives.RData")
rm(d, abstracts, affiliations)

View File

@@ -0,0 +1,56 @@
library(data.table)
# import ngram data
# note that the file is not pushed to repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv
# Top 100,000 ngrams (?)
ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
header=TRUE, stringsAsFactors=FALSE)[,-3]
names(ngrams)[1] <- "eid"
subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
stringsAsFactors=FALSE, sep="\t")[,c("eid",
"first_ASJC_subject_area")]
names(subjects)[2] <- "subject"
# takes a couple of minutes:
ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)
# only use ngrams that occur across all (many?) subject areas
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
length(unique(x)))
# summary(subject.by.ngram)
#
# library(txtplot)
# txtdensity(log(subject.by.ngram))
# Note:
# The median number of subject areas per term is five. We'll cut it
# off at terms that occur across at least 30 subject areas.
top.ngrams <- ngrams[ngrams$term %in%
names(subject.by.ngram[subject.by.ngram >
30]),c("eid", "term")]
rm(ngrams, subject.by.ngram, subjects)
# convert to a wide format matrix of dichotomous variables
library(reshape2)
library(data.table)
top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)
top.ngrams[,vv:= TRUE]
# took more than 20 minutes on hyak
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
value.var = "vv")
rm(top.ngrams)
save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
#load("processed_data/top.ngram.matrix.RData")

View File

@@ -0,0 +1,221 @@
library(data.table)
library(Matrix)
library(glmnet)
library(xtable)
library(methods)
predict.list <- NULL
if(!exists("top.ngram.matrix")){
load("processed_data/top.ngram.matrix.RData")
}
if(!exists("pred.descrip")){
load("paper/data/prediction_descriptives.RData")
covars <- pred.descrip$covars
}
top.ngram.matrix <- data.table(top.ngram.matrix)
setkey(top.ngram.matrix, eid)
covars <- data.table(pred.descrip$covars)
setkey(covars,eid)
# restrict to the overlap of the two datasets
covars <- covars[covars$eid %in% top.ngram.matrix$eid,]
top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in%
covars$eid,]
# rename the cited column so that downstream code can use 'cited.x' even if the merge below doesn't add a suffix
names(covars)[names(covars) == 'cited'] <- 'cited.x'
# then merge also to facilitate some manipulations below
d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE)
# Note that this duplicates some column names, so '.x' gets appended in a
# few cases.
# construct model matrices
x.controls <- sparse.model.matrix(cited.x ~ language.x +
modal_country + month.x,
data=d)[,-1]
x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1]
x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1]
x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1]
x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid))
x.ngrams <- as(x.ngrams, "sparseMatrix")
X <- cBind(x.controls, covars$year, covars$works_cited)
X.aff <- cBind(X, x.aff)
X.subj <- cBind(X.aff, x.subj)
X.venue <- cBind(X.subj, x.venue)
X.terms <- cBind(X.venue, x.ngrams)
Y <- covars$cited
### Hold-back sample for testing model performance later on:
set.seed(20160719)
holdback.index <- sample(nrow(X), round(nrow(X)*.1))
X.hold <- X[holdback.index,]
X.hold.aff <- X.aff[holdback.index,]
X.hold.subj <- X.subj[holdback.index,]
X.hold.venue <- X.venue[holdback.index,]
X.hold.terms <- X.terms[holdback.index,]
Y.hold <- Y[holdback.index]
X.test <- X[-holdback.index,]
X.test.aff <- X.aff[-holdback.index,]
X.test.subj <- X.subj[-holdback.index,]
X.test.venue <- X.venue[-holdback.index,]
X.test.terms <- X.terms[-holdback.index,]
Y.test <- Y[-holdback.index]
############### Models and prediction
set.seed(20160719)
m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial",
type.measure="class")
con.pred = predict(m.con, type="class", s="lambda.min",
newx=X.hold)
m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial",
type.measure="class")
aff.pred = predict(m.aff, type="class", s="lambda.min",
newx=X.hold.aff)
m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial",
type.measure="class")
subj.pred = predict(m.subj, type="class", s="lambda.min",
newx=X.hold.subj)
m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial",
type.measure="class")
venue.pred = predict(m.venue, type="class", s="lambda.min",
newx=X.hold.venue)
m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial",
type.measure="class")
terms.pred = predict(m.terms, type="class", s="lambda.min",
newx=X.hold.terms)
##########
# Compare test set predictions against held-back sample:
pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred,
venue.pred, terms.pred))
names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue",
"+ Terms")
m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms)
# collect:
# df
# percent.deviance
# nonzero coefficients
# prediction error
gen.m.summ.info <- function(model){
df <- round(tail(model$glmnet.fit$df, 1),0)
percent.dev <- round(tail(model$glmnet.fit$dev.ratio, 1),2)*100
cv.error <- round(tail(model$cvm,1),2)*100
# null.dev <- round(tail(model$glmnet.fit$nulldev),0)
out <- c(df, percent.dev, cv.error)
return(out)
}
gen.class.err <- function(pred, test){
props <- prop.table(table(pred, test))
err.sum <- round(sum(props[1,2], props[2,1]),2)*100
return(err.sum)
}
results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list,
gen.m.summ.info)),
byrow=T, nrow=5)))
results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x,
Y.hold))
results.tab <- data.frame(lapply(results.tab, as.character))
names(results.tab) <- c("Model", "N features", "Deviance (%)",
"CV error (%)", "Hold-back error (%)")
print(xtable(results.tab,
caption=
"Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.",
label='tab:predict_models', align='llrrrr'),
include.rownames=FALSE)
# Store the results:
predict.list$results.tab <- results.tab
############# Generate most salient coefficients
nz.coefs <- data.frame( coef =
colnames(X.test.terms)[which(
coef(m.terms, s="lambda.min")
!= 0)],
type = "term",
beta =
coef(m.terms,
s="lambda.min")[which(coef(m.terms,
s="lambda.min")
!= 0)])
nz.coefs$coef <- as.character(nz.coefs$coef)
nz.coefs$type <- as.character(nz.coefs$type)
nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),]
# comparison:
#nz.coefs$type <- "terms"
nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA
nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue"
nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject"
nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation"
nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month"
nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country"
nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language"
nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year"
# cleanup
nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef)
nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef)
nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef)
nz.coefs$beta <- round(nz.coefs$beta, 3)
names(nz.coefs) <- c("Feature", "Type", "Coefficient")
predict.list$nz.coefs <- nz.coefs
# table for all
round(prop.table(table(nz.coefs$Type))*100, 2)
# for top subsets
round(prop.table(table(nz.coefs$Type[1:700]))*100, 2)
round(prop.table(table(nz.coefs$Type[1:200]))*100, 2)
round(prop.table(table(nz.coefs$Type[1:100]))*100, 2)
print(xtable(
as.matrix(head(nz.coefs, 10)),
label='tab:nzcoefs',
caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.',
align='lllr'
), include.rownames=FALSE)
# output
save(predict.list, file="paper/data/prediction.RData")

code/prediction/utils.R Normal file
View File

@@ -0,0 +1,13 @@
# Use this to check for underpopulated cells
gen.counts <- function(df, c.var){
tapply(df[,"eid"], c.var, function(x) length(unique(x)))
}
# use this to remove underpopulated cells
restrict <- function(df, c.var, c.min){
var.counts <- gen.counts(df, c.var)
out.df <- df[c.var %in% names(var.counts[var.counts >
c.min]),]
return(out.df)
}
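# Example usage (mirrors the calls in the prediction data script above; assumes a
# data frame 'd' with an 'eid' column): drop levels of d$language that appear for
# only one paper:
#   d <- restrict(d, d$language, 1)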

View File

@@ -0,0 +1,126 @@
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import sys
import csv
import pandas as pd
import argparse
"""
This code was inspired/copied from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.
It takes in an abstract file and creates two outputs: the abstracts together with their topic distributions, and a set of topics with the top words associated with each.
"""
n_samples = None # Enter an integer here for testing.
n_features = 20000
n_topics = 12
def main():
parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
parser.add_argument('-i', help='Abstracts file',
default='processed_data/abstracts.tsv')
parser.add_argument('-o', help='Where to output results',
default='processed_data/abstracts_LDA.csv')
parser.add_argument('-t', help='Where to output topics and top words associated with them',
default='processed_data/top_words.csv')
args = parser.parse_args()
print("Loading dataset...")
t0 = time()
dataset, doc_data = get_abstracts(args.i)
data_samples = dataset[:n_samples]
doc_data = doc_data[:n_samples]
print("done in %0.3fs." % (time() - t0))
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, # Terms that show up in > max_df of documents are ignored
min_df=2, # Terms that show up in < min_df of documents are ignored
max_features=n_features, # Only use the top max_features
stop_words='english',
ngram_range=(1,2))
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print("Fitting LDA models with tf features, "
"n_samples=%d and n_features=%d..."
% (len(data_samples), n_features))
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=2017,
n_jobs=2)
t0 = time()
model = lda.fit(tf)
transformed_model = lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))
# Change the values into a probability distribution for each abstract
topic_dist = [[topic/sum(abstract_topics) for topic in abstract_topics]
for abstract_topics in transformed_model]
# Make the topic distribution into a dataframe
td = pd.DataFrame(topic_dist)
# Get the feature names (i.e., the words/terms)
tf_feature_names = tf_vectorizer.get_feature_names()
# Get the top words by topic
topic_words = get_top_words(lda, tf_feature_names, 20)
# Sort by how often topic is used
topic_words = topic_words.reindex_axis(sorted(topic_words.columns, key = lambda x: td[x].sum(), reverse=True),axis=1)
# Rearrange the columns by how often each topic is used
td = td.reindex_axis(sorted(td.columns, key = lambda x: td[x].sum(), reverse=True),axis=1)
topic_words.to_csv(args.t, index=False)
df = pd.DataFrame(doc_data)
df = df.join(td)
df.to_csv(args.o, index=False)
def get_abstracts(fn):
with open(fn, 'r') as f:
in_csv = csv.DictReader(f, delimiter='\t')
abstracts = []
doc_data = []
for r in in_csv:
try:
curr_abstract = r['abstract']
# If this isn't really an abstract, then don't add it
if len(curr_abstract) > 5:
# Add the abstracts to the corpus, and save the data
abstracts.append(r['abstract'])
doc_data.append(r)
except KeyError:
print(r)
return abstracts, doc_data
def get_top_words(model, feature_names, n_top_words):
'''Takes the model, the words used, and the number of words requested.
Returns a dataframe of the top n_top_words for each topic'''
r = pd.DataFrame()
# For each topic
for i, topic in enumerate(model.components_):
# Get the top feature names, and put them in that column
r[i] = [add_quotes(feature_names[i])
for i in topic.argsort()[:-n_top_words - 1:-1]]
return r
def add_quotes(s):
'''Adds quotes around multiple term phrases'''
if " " in s:
s = '"{}"'.format(s)
return s
if __name__ == '__main__':
main()
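# Example invocation (hypothetical script name; the -i/-o/-t defaults above already
# point at processed_data/, so flags are only needed to override those locations):
#   python lda_topics.py -i processed_data/abstracts.tsv -o processed_data/abstracts_LDA.csv -t processed_data/top_words.csv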

View File

@@ -0,0 +1,103 @@
'''Creates the figures and tables for LaTeX'''
import pandas as pd
import numpy as np
import datetime
import argparse
import os
topic_names = [
'Media Use',
'Social Network Analysis',
'Consumer Analysis',
'Education',
'Quantitative Analysis',
'Information Spread',
'Health',
'Sentiment Analysis',
'News',
'HCI',
'Influence',
'Methodology'
]
def main():
parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
parser.add_argument('-a', help='Abstracts LDA file',
default='processed_data/abstracts_LDA.csv')
parser.add_argument('-w', help='Top words file',
default='processed_data/top_words.csv')
parser.add_argument('-t', help='Topic tables directory',
default='paper/tables/')
parser.add_argument('-o', help = 'RData output file location',
default = 'paper/data/topic_model_data.RData')
args = parser.parse_args()
# Make the top_words tables
tw = pd.read_csv(args.w)
# Add names
tw.columns = topic_names
# Save as 2 different tables, because they are too long
if not os.path.exists(args.t):
os.makedirs(args.t)
tw.to_latex(args.t + 'topic_words1.tex',index=False, columns=tw.columns[:6])
tw.to_latex(args.t + 'topic_words2.tex',index=False, columns=tw.columns[6:])
# Load the abstracts and topics data
df = pd.read_csv(args.a)
n_topics = len(tw.columns)
# Change to datetime
df.date = pd.to_datetime(df.date)
# Remove papers from 2016 since we don't have the entire year, so graphs are misleading
df = df[df.date <= pd.to_datetime('2015-12-31')]
df = df.set_index('date')
# Rename the last columns as the topic names
df.columns = list(df.columns[:-n_topics]) + topic_names
# Group by year, and get only the LDA columns
topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
# Get summary statistics for each topic
# Total amount published in each topic by year
topic_sums = topics_by_year.sum()
# Mean amount published in each topic
topic_means = topics_by_year.mean()
# Now, we weight the contributions by how much a paper has been cited.
# Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
# T1: .5
# T2: .3
# T3: 0
# T4: .2
# To account for how influential a paper is, we take all of the topic columns for a document
# and multiply their weights by the logged citations the paper has received.
citation_weighted_topics = df[df.columns[-n_topics:]]
citation_weighted_topics = citation_weighted_topics.apply(lambda x: x * np.log1p(df.cited_by_count), axis=0)
weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()
## write data to R
# import the rpy2 modules we use to write R data files, and create the variable we'll write to
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = {'weighted_sums' : weighted_sums,
'topic_sums' : topic_sums,
'topic_means' : topic_means }
for var_name, x in r.items():
robjects.r.assign(var_name.replace("_", "."), x)
if not os.path.exists(os.path.dirname(args.o)):
os.makedirs(os.path.dirname(args.o))
robjects.r('save({},file = "{}")'.format(
",".join([k.replace("_", ".") for k in r.keys()]),
args.o
))
robjects.r("rm(list=ls())")
if __name__ == '__main__':
main()