initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
2493
code/bibliometrics/00_citation_network_analysis.ipynb
Normal file
2493
code/bibliometrics/00_citation_network_analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
232
code/bibliometrics/00_citation_network_analysis.py
Normal file
232
code/bibliometrics/00_citation_network_analysis.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# coding: utf-8
|
||||
# # Import data and get things setup
|
||||
|
||||
import random
|
||||
random.seed(9001)
|
||||
|
||||
# import code to write r modules and create our variable we'll write to
|
||||
import rpy2.robjects as robjects
|
||||
from rpy2.robjects import pandas2ri
|
||||
pandas2ri.activate()
|
||||
|
||||
# Module-level store of everything we want to export to R at the end.
r = {}


def remember(name, x):
    """Stash *x* under *name* for later export to the paper's RData file."""
    r[name] = x
|
||||
|
||||
# load in modules we'll need for analysis
|
||||
import subprocess
|
||||
import csv
|
||||
from igraph import *
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
|
||||
# grab the largest connected component with a little function
def get_largest_component(g):
    """Return the largest weakly-connected component of graph *g* as a subgraph."""
    components = g.components(mode="WEAK")
    biggest = max(components.sizes())
    for sub in components.subgraphs():
        if sub.vcount() == biggest:
            return sub
|
||||
|
||||
# load the full edgelist into igraph
def edge_list_iter(df):
    """Yield a (from, to) tuple for every row of edgelist DataFrame *df*."""
    for _, row in df.iterrows():
        yield row['from'], row['to']
|
||||
|
||||
# list top 5 journals for each of the clusters
def top_journals_for_clusters(clu):
    """Return a DataFrame of the five most common journals per cluster.

    *clu* must carry 'eid' and 'cluster' columns; journal names are joined
    in from the module-level ``articles`` table.
    """
    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])

    # DataFrame.append() was removed in pandas 2.0 -- collect the
    # per-cluster counts and concatenate once instead.
    pieces = []
    for cid in articles_tmp['cluster'].unique():
        journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)
        pieces.append(pd.DataFrame({'cluster': cid, 'count': journal_counts}))
    output = pd.concat(pieces) if pieces else pd.DataFrame()

    # journal names live in the index after value_counts(); surface them
    output = output.reset_index()
    output = output.rename(columns={'index': "journal"})
    return output
|
||||
|
||||
def infomap_edgelist(g, edgelist_filename, directed=True):
    """Cluster graph *g* with the external Infomap binary.

    Writes g's edgelist to code/bibliometrics/<edgelist_filename>.txt,
    shells out to Infomap, then reads the resulting .clu file back and
    returns a DataFrame mapping each vertex's 'eid' to its 'cluster'
    (plus Infomap's 'flow' score), joined on the integer node id.
    """
    # map igraph's integer vertex ids back to the Scopus 'eid' names
    nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,
                                'eid' : v['name']} for v in g.vs ])

    # write out the edgelist to an external file so we can call infomap on it
    with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
        for e in g.es:
            # skip self-loops
            if e.source != e.target:
                if 'weight' in e.attributes():
                    print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
                else:
                    print("{}\t{}".format(e.source, e.target), file=f)

    # run the external program to generate the infomap clustering
    # NOTE(review): the third list element bundles the output directory and
    # the -z/--map/--clu/--tree flags into a single argv entry, so Infomap
    # receives them as one argument -- confirm this invocation works.
    infomap_cmdline = ["code/bibliometrics/infomap/Infomap", "code/bibliometrics/" + edgelist_filename + ".txt", "code/bibliometrics/output_dir -z --map --clu --tree"]
    if directed:
        infomap_cmdline.append("-d")
    subprocess.call(infomap_cmdline)

    # load up the clu data
    clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
                      header=None, comment="#", delim_whitespace=True)
    clu.columns = ['node_infomap', 'cluster', 'flow']

    return pd.merge(clu, nodes_tmp, on="node_infomap")
|
||||
|
||||
|
||||
def write_graphml(g, clu, graphml_filename):
    """Attach cluster labels to the vertices of *g* and write it as GraphML."""
    # sort by Infomap node id so labels line up with vertex order
    ordered = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
    g.vs["cluster"] = ordered["cluster"].tolist()
    g.write_graphml("code/bibliometrics/" + graphml_filename)
|
||||
|
||||
|
||||
# load article data
articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")

# # network for just the central "social media" set

# this contains the list of all INCOMING citations for papers in the original set
raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")

# build a directed citation graph from the (from, to) pairs
g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)

# keep only the largest weakly-connected component, then drop
# self-loops and multi-edges before clustering
g_sm = get_largest_component(g_sm_all)
g_sm = g_sm.simplify()

# cluster the social-media network with Infomap
g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)

# notebook-style peek at cluster sizes (value discarded when run as a script)
g_sm_clu['cluster'].value_counts()

write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
|
||||
|
||||
|
||||
# # larger network that contains the incoming cites to citing articles

# this contains the list of all INCOMING citations to everything in the original set
# plus every INCOMING citation to every paper that cites one of those papers
raw_edgelist_files = ["processed_data/citation_edgelist.txt",
                      "processed_data/social_media_edgelist.txt"]
combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])

g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)

# largest weakly-connected component, simplified, same as the sm network
g_full = get_largest_component(g_full_all)
g_full = g_full.simplify()

# NOTE(review): "edglist" (sic) is the on-disk name downstream steps expect;
# renaming it here would orphan the generated files.
g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)

# notebook-style peeks (values discarded when run as a script)
g_full_clu['cluster'].value_counts()

top_journals_for_clusters(g_full_clu)

write_graphml(g_full, g_full_clu, "g_full.graphml")
|
||||
|
||||
|
||||
# # create the meta-network of connections between clusters

# label each edge with the cluster of its citation target...
edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]], how="inner", left_on="to", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
# drop the merge key; the positional axis argument to drop() was removed
# in pandas 2.0, so use the explicit columns= form
edgelist_tmp.drop(columns='eid', inplace=True)

# ...and with the cluster of its citation source
edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]], how="inner", left_on="from", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
edgelist_tmp.drop(columns='eid', inplace=True)

# keep only between-cluster citations
edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]

# cross-tabulate into a cluster-by-cluster citation-count matrix, then
# melt back into a long (to_cluster, from_cluster, value) edgelist
cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
cluster_edgelist["to_cluster"] = cluster_edgelist.index

cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]

remember("cluster_edgelist", cluster_edgelist)

# the six biggest clusters become the nodes of the meta-network
top_clusters = g_sm_clu["cluster"].value_counts().head(6).index

# write the edgelist for the total number of clusters (currently 1-6)
cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
                                           (cluster_edgelist["from_cluster"].isin(top_clusters))]

cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]

g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values], directed=True)
g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()

# assign the number of total articles as an attribute for each node
g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()

g_cluster.write_graphml("code/bibliometrics/clusters.graphml")
|
||||
|
||||
# # create network stats for tables (overall and within clusters)

def create_network_stats(g):
    """Per-vertex centrality statistics for *g*, joined with article metadata."""
    stats = pd.DataFrame({'eid' : g.vs['name'],
                          'eig_cent' : g.eigenvector_centrality(),
                          'indegree' : g.indegree(),
                          'betweenness' : g.betweenness()})
    # inner join silently drops vertices with no matching article record
    return pd.merge(stats,
                    articles[['eid', 'title', 'source_title']],
                    how="inner")
|
||||
|
||||
network_stats = create_network_stats(g_full)

# notebook-style peeks at the most-cited / most-central papers
# (return values are discarded when run as a script)
network_stats.sort_values("indegree", ascending=False).head(4)

network_stats.sort_values("eig_cent", ascending=False).head(4)

network_stats.sort_values("betweenness", ascending=False).head(4)
|
||||
|
||||
# # things to store
remember('total_articles', articles.shape[0])

# total number of citations in the sm dataset
remember('sm_citations', raw_edgelist.shape[0])

# number of distinct citing papers in the sm dataset
remember('sm_citing', len(raw_edgelist["from"].unique()))

# the number of articles in the original dataset that have any INCOMING citations
remember('sm_cited', len(raw_edgelist["to"].unique()))

# total number of citations in the combined (full) dataset
remember('all_citations', combo_raw_edgelist.shape[0])

# number of distinct citing papers in the combined dataset
remember('all_citing', len(combo_raw_edgelist["from"].unique()))

# the number of articles in the combined dataset that have any INCOMING citations
remember('all_cited', len(combo_raw_edgelist["to"].unique()))

# per-article cluster assignments for the social-media network
remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])

# notebook-style listing of everything queued for export (value discarded)
sorted(r.keys())
|
||||
|
||||
# save the remembered values to an RData file
def save_to_r(r_dict, filename="output.RData"):
    """Export every (name, value) pair in *r_dict* to an RData file.

    Underscores in names become dots per R convention; DataFrames are
    converted with pandas2ri. All values are bundled into a single R list
    named 'r' and saved to *filename*.
    """
    for var_name, x in r_dict.items():  # was: iterated the global `r`, ignoring the parameter
        var_name = var_name.replace('_', '.')
        # np.asscalar() was removed in NumPy 1.23; .item() is the replacement
        if type(x) == np.int64:
            x = x.item()

        if type(x) == pd.DataFrame:
            rx = pandas2ri.py2ri(x)
        else:
            rx = x

        # was: assigned the unconverted `x`, discarding the py2ri conversion
        robjects.r.assign(var_name, rx)

    # bundle every assigned variable into one R list named 'r'
    robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
    robjects.r('save("r", file="{}")'.format(filename))
    robjects.r("rm(list=ls())")
|
||||
|
||||
# export everything collected via remember() for use by the paper's R code
save_to_r(r, "paper/data/network_data.RData")
|
||||
|
||||
BIN
code/bibliometrics/clusters.gephi
Normal file
BIN
code/bibliometrics/clusters.gephi
Normal file
Binary file not shown.
BIN
code/bibliometrics/g_sm.gephi
Normal file
BIN
code/bibliometrics/g_sm.gephi
Normal file
Binary file not shown.
24
code/data_collection/00_get_search_results.py
Normal file
24
code/data_collection/00_get_search_results.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import argparse
|
||||
from request_functions import *
|
||||
|
||||
'''
|
||||
This script takes in a search query and an output file. It queries the scopus API to find all papers that match the search query, and saves them to the output file.
|
||||
|
||||
Unlike some of the other scripts in this directory, it does not try to determine the state - if you restart the script, it will start over and blow away whatever you had saved before.
|
||||
'''
|
||||
|
||||
# query one year at a time to stay under the API's per-query result limits
years = range(2004, 2017)


def main():
    """Run the Scopus search for every year and write results to the output file."""
    parser = argparse.ArgumentParser(description='Output JSON of all articles matching search query')
    parser.add_argument('-q', help='Search query', required=True)
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    # 'w' mode: this script intentionally starts over rather than resuming
    with open(args.o, 'w') as out_file:
        for year in years:
            get_search_results(args.q, out_file, year=year)


if __name__ == '__main__':
    main()
|
||||
56
code/data_collection/01_get_abstracts.py
Normal file
56
code/data_collection/01_get_abstracts.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
|
||||
def main():
    """Fetch abstract/bibliography JSON for each requested eid, resuming if interrupted."""
    parser = argparse.ArgumentParser(description='Output JSON of abstracts and bibliography of all articles passed in.')
    parser.add_argument('-i', help='JSON file which includes eids')
    parser.add_argument('--eid', '-e', help='Single eid')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    if args.eid:
        eids = [args.eid]
    elif args.i:
        with open(args.i, 'r') as f:
            eids = [json.loads(line)['eid'] for line in f]
    else:
        print('Need to either pass in an eid or a json file with eids')
        return  # was: fell through and crashed with NameError on `eids`

    # If the script gets interrupted, we need to start where we left off
    errors = []
    try:
        with open(args.o, 'r') as f:
            completed_eids = []
            for line in f:
                try:
                    result = json.loads(line)
                    completed_eids.append(result['abstracts-retrieval-response']['coredata']['eid'])
                except ValueError:
                    # unparseable line in the output file -- log it with the
                    # other failures below
                    errors.append(line)
    except IOError:
        # no output file yet, so nothing has been completed
        completed_eids = []

    print('{} completed eids'.format(len(completed_eids)))
    with open(args.o, 'a') as out_file:
        for eid in eids:
            if eid not in completed_eids:
                result = get_abstract(eid)
                if result:
                    out_file.write(result)
                    out_file.write('\n')
                else:
                    errors.append(eid)

    if len(errors) > 0:
        with open('raw_data/missing_eids.json', 'a') as l:
            # Add the bad lines from the output file
            # was: a generator expression that was never consumed, so
            # nothing was ever written here
            for e in errors:
                l.write(e)


if __name__ == '__main__':
    main()
|
||||
43
code/data_collection/02_get_cited_by.py
Normal file
43
code/data_collection/02_get_cited_by.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from request_functions import *
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from os import remove
|
||||
|
||||
def main():
    """Fetch all articles that cite each input eid, appending JSON to the output.

    Resumable: parent eids already present in the output are skipped.
    Because results for the last parent may be incomplete, its lines are
    deleted (via sed) and that eid is fetched again.
    """
    parser = argparse.ArgumentParser(description='Output JSON of all articles which cite the articles passed in')
    parser.add_argument('-i', help='JSON file which includes eids and citedby-count')
    parser.add_argument('-o', help='Where to append JSON results')
    args = parser.parse_args()

    with open(args.i, 'r') as f:
        # Make a dictionary of eid:citation count for each line in the file
        eids = {}
        for line in f:
            l = json.loads(line)
            eids[l['eid']] = l['citedby-count']

    # If the script gets interrupted, we need to start where we left off
    try:
        # Open the output file, and grab all of the eids which are already completed
        with open(args.o, 'r') as f:
            completed_eids = [json.loads(l)['parent_eid'] for l in f]
        # Remove those which came from the last id (since we may have missed some)
        if len(completed_eids) > 0:
            last_eid = completed_eids.pop()
            # Remove all of the lines which came from the last eid
            subprocess.call(['sed', '-i.bak', '/parent_eid": "{}/d'.format(last_eid), args.o])
            # Hopefully everything has worked out, because here we blow away the backup
            remove('{}.bak'.format(args.o))
    except IOError:
        # If the file doesn't exist, then there aren't any completed eids
        completed_eids = []

    with open(args.o, 'a') as out_file:
        for eid, citation_count in eids.items():
            # citedby-count arrives as a string, hence the '0' comparison
            if citation_count != '0' and eid not in completed_eids:
                get_cited_by(eid, out_file)


if __name__ == '__main__':
    main()
|
||||
166
code/data_collection/request_functions.py
Normal file
166
code/data_collection/request_functions.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from scopus_api import key as API_KEY
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
import re
|
||||
|
||||
# DEBUG level also surfaces every request URL logged below
logging.basicConfig(level=logging.DEBUG)

# retry count for timed-out API calls, and per-request timeout in seconds
RETRY_COUNT = 5
TIMEOUT_SECS = 10

# Initialize a global session object
# (shared so the auth token stored by get_token() is reused by all calls)
s = requests.Session()
s.headers.update({'X-ELS-APIKey' : API_KEY,
                  'X-ELS-ResourceVersion' : 'XOCS',
                  'Accept' : 'application/json'})
|
||||
|
||||
def get_token(location_id = None):
    '''Given a location_id, gets an authentication token'''
    # Called when the API answers 401; the token is stored on the shared
    # session so every subsequent request carries it.
    print('Getting a token')
    api_resource = 'http://api.elsevier.com/authenticate'
    # Parameters
    payload = {'platform':'SCOPUS',
               'choice': location_id}
    r = s.get(api_resource, params = payload)
    r.raise_for_status()
    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
|
||||
|
||||
def get_search_results(query, output_file, results_per_call = 200,
                       tot_results=None, year=None, sort='+title', citation_call=False):
    '''Handles getting search results. Takes a query and an output
    file. Writes as many of the search results as possible to the
    output file as JSON dictionaries, one per line.

    Scopus caps a single sort order at 5000 results, so result sets
    between 5001 and 10000 are split into an ascending-title half and a
    descending-title half; anything beyond 10000 is truncated.
    citation_call=True tags each written entry with its parent_eid.
    '''
    result_set = []
    results_added = 0
    def curr_call(start=0, count=results_per_call):
        '''Shorthand for the current call: DRY'''
        return make_search_call(query, start=start,
                                count=count, year=year, sort=sort)
    if tot_results == None:
        # Call the API initially to figure out how many results there are, and write the results
        initial_results = curr_call(count=results_per_call)
        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
        result_set.append((initial_results, sort))
        results_added += results_per_call
        logging.debug("Total results: {}".format(tot_results))

    if tot_results == 0:
        return None
    if tot_results > 5000:
        # If this is just one year, we can't get any more granular, and
        # we need to return what we can.
        if tot_results > 10000:
            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
            first_half = last_half = 5000
        else:
            # Get half, and correct for odd # of results
            first_half = tot_results//2 + tot_results % 2
            last_half = tot_results//2
        # Break the search into the first half and the bottom half of results.
        # (each recursive call re-fetches from its own start, so the page
        # buffered in result_set above is deliberately not written here)
        get_search_results(query, output_file,
                           year = year,
                           tot_results=first_half)
        # Get the other half, sorted descending so it comes from the far end
        get_search_results(query, output_file,
                           year = year,
                           tot_results = last_half, sort='-title')
    # If there are 5000 or fewer to retrieve, then get them
    else:
        logging.debug('Retrieving {} results'.format(tot_results))
        # As long as there are more citations to retrieve, then do it, and write
        # them to the file
        while results_added < tot_results:
            # If we are near the end, then only get as many results as are left.
            to_retrieve = min(results_per_call, (tot_results - results_added))
            curr_results = curr_call(start=results_added, count=to_retrieve)
            result_set.append((curr_results, sort))
            results_added += results_per_call
        # This is hacky, but I'm doing it
        # If this is a citation call, then construct metadata to be written with the result
        if citation_call:
            metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
        else:
            metadata = {}
        # NOTE(review): indentation was lost in this archive dump; this
        # write is placed inside the <=5000 branch, which matches the
        # split-and-recurse logic above -- confirm against the original.
        write_results(result_set, output_file, metadata)
|
||||
|
||||
def write_results(result_set, output_file, metadata=None):
    """Write search results to *output_file* as JSON dictionaries, one per line.

    *result_set* is a list of (response_json, sort) pairs; results fetched
    with a descending sort ('-' prefix) are reversed so the file keeps a
    consistent order. Each entry is augmented with the *metadata* keys
    before being written.
    """
    # was: `metadata={}` -- a shared mutable default argument
    if metadata is None:
        metadata = {}
    for search_json, sort_order in result_set:
        to_reverse = sort_order.startswith('-')
        # (the original wrapped this lookup in `except KeyError: raise`,
        # which is a no-op; a missing key still propagates)
        results = [x for x in search_json['search-results']['entry']]
        if to_reverse:
            results = results[::-1]
        for x in results:
            for k, v in metadata.items():
                x[k] = v
            json.dump(x, output_file)
            output_file.write('\n')
|
||||
|
||||
|
||||
def make_search_call(query, start=0, count=200,
                     sort='+title', year=None,
                     retry_limit = RETRY_COUNT,
                     timeout_secs = TIMEOUT_SECS):
    """Issue one Scopus search request and return the parsed JSON.

    Retries on timeout up to *retry_limit* times; a 401 triggers token
    refresh and a retry, a 400 raises immediately, anything else
    non-2xx raises via raise_for_status().
    """
    api_resource = "https://api.elsevier.com/content/search/scopus"
    # Parameters
    payload = {'query':query,
               'count':count,
               'start':start,
               'sort': sort,
               'date': year}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params = payload,
                      timeout = timeout_secs)
            logging.debug(r.url)
            if r.status_code == 401:
                # expired/missing token: refresh and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
            break
        except requests.exceptions.Timeout:
            pass
    else:
        # for-else: every attempt timed out (or got a 401)
        raise requests.exceptions.Timeout('Timeout Error')

    r.raise_for_status()
    return r.json()
|
||||
|
||||
|
||||
def get_cited_by(eid, output_file):
    """Fetch every article citing *eid* and append the results to *output_file*."""
    query = 'refeid({})'.format(eid)
    return get_search_results(query, output_file,
                              results_per_call=200, citation_call=True)
|
||||
|
||||
|
||||
def get_abstract(eid, retry_limit = RETRY_COUNT,
                 timeout_secs = TIMEOUT_SECS):
    """Fetch the raw abstract JSON for *eid* as a UTF-8 string.

    Returns None when the API answers 404; retries timeouts up to
    *retry_limit* times; refreshes the auth token on 401.
    """
    api_resource = "http://api.elsevier.com/content/abstract/eid/{}".format(eid)
    # Parameters
    payload = {}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params = payload,
                      timeout = timeout_secs)
            if r.status_code == 401:
                # expired/missing token: refresh and retry
                get_token()
                continue
            if r.status_code == 400:
                raise requests.exceptions.HTTPError('Bad request; possibly you aren\'t connected to an institution with Scopus acces?')
            break
        except requests.exceptions.Timeout:
            pass
    else:
        # for-else: every attempt timed out
        raise requests.exceptions.Timeout('Timeout Error')
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.content.decode('utf-8')
|
||||
1
code/data_collection/scopus_api.py
Normal file
1
code/data_collection/scopus_api.py
Normal file
@@ -0,0 +1 @@
|
||||
# Placeholder Scopus API key -- the real key was scrubbed before this
# public release; substitute your own Elsevier API key here.
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
|
||||
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
177
code/data_processing/00_abstracts_to_tsv.py
Normal file
@@ -0,0 +1,177 @@
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
import random
|
||||
|
||||
# fixed seed so the random tie-break in get_country() is reproducible
random.seed(2017)
|
||||
|
||||
def main():
    """Convert a JSON-lines abstract dump into a TSV of cleaned records."""
    parser = argparse.ArgumentParser(description='Change a big ugly abstract file to a nice CSV')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            # Have to get the field names
            # (taken from the first record; all records yield the same keys)
            first_line = clean_abstract(json.loads(next(i)))
            fieldnames = first_line.keys()
            output = csv.DictWriter(o, fieldnames, delimiter='\t')
            output.writeheader()
            output.writerow(first_line)
            for line in i:
                output.writerow(clean_abstract(json.loads(line)))
|
||||
|
||||
|
||||
def clean_abstract(json_response):
    """Flatten one Scopus abstract-retrieval JSON record into a flat dict.

    Missing required pieces raise KeyError/TypeError; callers are
    expected to let those propagate.
    """
    result = json_response['abstracts-retrieval-response']
    head = result['item']['bibrecord']['head']
    try:
        attributes = {
            'modal_country': get_country(head),
            'abstract' : get_abstract(result),
            'title' : get_title(result),
            'source_title': get_source_title(head),
            'language': result['language']['@xml:lang'],
            'first_ASJC_subject_area': get_subject(result, '$'),
            'first_ASJC_classification': get_subject(result, '@code'),
            'first_CPX_class': get_CPX_class(head, 'classification-description'),
            'date': to_date(result['coredata']['prism:coverDate']),
            'aggregation_type' : if_exists('prism:aggregationType',result['coredata'],else_val='NA'),
            'eid' : result['coredata']['eid'],
            'cited_by_count': result['coredata']['citedby-count'],
            'num_citations': get_citation_count(result)
        }
    # both handlers just re-raise; kept so a breakpoint/print can be
    # re-enabled here when a malformed record shows up
    except KeyError:
        raise
    except TypeError:
        # print(result)
        raise
    return attributes
|
||||
|
||||
def get_citation_count(result):
    """Reference count from the record's bibliography, or None when the tail is null."""
    try:
        return result['item']['bibrecord']['tail']['bibliography']['@refcount']
    except TypeError:
        # the 'tail' element can be JSON null, which makes the subscript blow up
        return None
|
||||
|
||||
def get_title(result):
    """Return the article title; a KeyError propagates when it is missing."""
    # was wrapped in a no-op `try: ... except KeyError: raise`
    return result['coredata']['dc:title']
|
||||
|
||||
|
||||
def get_source_title(head):
    """Return the publication (journal) title; a KeyError propagates when missing."""
    # was wrapped in a no-op `try: ... except KeyError: raise`
    return head['source']['sourcetitle']
|
||||
|
||||
def get_abstract(result):
    """Return the abstract text with newlines flattened, or None when absent."""
    try:
        text = result['coredata']['dc:description']
    except KeyError:
        return None
    return text.replace('\n', ' ')
|
||||
|
||||
def get_auth_names(head):
    """Return 'Given Surname' strings for every author in *head*.

    Logs the record and returns an empty list when it has no author-group
    (the original fell through to a NameError on `auth_info` here).
    """
    try:
        auth_info = [x['author'] for x in make_list(head['author-group'])]
    except KeyError:
        print(head)
        return []  # was: execution continued into the loop with auth_info undefined
    auth_names = []
    for auth_group in auth_info:
        for auth in make_list(auth_group):
            auth_names.append('{} {}'.format(
                auth['preferred-name']['ce:given-name'],
                auth['preferred-name']['ce:surname']))
    return auth_names
|
||||
|
||||
def get_country(head):
    """Most common author-affiliation country, with random tie-breaking.

    Returns None (implicitly) when no affiliation info is available.
    """
    all_countries = get_aff_info(head, 'country')
    if all_countries:
        # Find the mode. If there's more than one, choose randomly
        # (a dead leftover assignment `modes = Counter` was removed here)
        distinct = set(all_countries)
        max_count = max(all_countries.count(x) for x in distinct)
        modes = [x for x in distinct if all_countries.count(x) == max_count]
        return random.choice(modes)
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author (repeated once per co-author).

    Returns None when the record has no author-group at all; author groups
    lacking the requested key contribute empty strings instead.
    """
    aff_info = []
    try:
        authors = make_list(head['author-group'])
    except KeyError:
        return None
    for x in authors:
        try:
            num_auth = len(make_list(x['author']))
        except KeyError:
            # Apparently there are things called "collaborations", which don't have affiliation info.
            # I'm just skipping them
            continue
        except TypeError:
            # And apparently "None" appears in the author list for no reason. :)
            continue
        try:
            curr_inst = x['affiliation'][affiliation_key]
            # Add one instance for each author from this institution
            aff_info += [curr_inst] * num_auth
        except KeyError:
            # If there isn't affiliation info for these authors, return empty str
            aff_info += [''] * num_auth
    return aff_info
|
||||
|
||||
def get_keywords(head):
    """Author keywords for the record, or None when there are none."""
    cite_info = head['citation-info']
    try:
        keywords = [x for x in
                    make_list(cite_info['author-keywords']['author-keyword'])]
        # When there's only one keyword, it's a string. Otherwise, we will
        # have a list of dictionaries
        # NOTE(review): the single-item branch returns the raw one-element
        # list rather than extracting '$' -- confirm this is intended.
        if len(keywords) == 1:
            return keywords
        else:
            return [x['$'] for x in keywords]
    except KeyError:
        return None
|
||||
|
||||
def get_subject(result, key):
    """First ASJC subject-area value under *key* ('$' for name, '@code' for code)."""
    try:
        return [x[key] for x in make_list(result['subject-areas']['subject-area'])][0]
    except KeyError:
        # dump the offending record before re-raising
        print(result)
        raise
|
||||
|
||||
def get_CPX_class(head, class_key):
    """First Compendex (CPXCLASS) classification value under *class_key*, or None.

    Returns None (implicitly) when no CPXCLASS group exists at all.
    """
    try:
        for x in head['enhancement']['classificationgroup']['classifications']:
            if x['@type'] == 'CPXCLASS':
                try:
                    return [y[class_key] for y in make_list(x['classification'])][0]
                except (KeyError, TypeError):
                    return None
    except KeyError:
        # dump the classification group before re-raising
        print(head['enhancement']['classificationgroup'])
        raise
|
||||
|
||||
def to_date(date_string):
    """Parse an ISO 'YYYY-MM-DD' string into a datetime."""
    return datetime.strptime(date_string, '%Y-%m-%d')
|
||||
|
||||
|
||||
def if_exists(key, dictionary, else_val = None):
    """Return dictionary[key], or *else_val* when the key is missing."""
    # dict.get does exactly this lookup-with-default in one call
    return dictionary.get(key, else_val)
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a lone value in a list; pass lists through unchanged."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]


if __name__ == '__main__':
    main()
|
||||
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
25
code/data_processing/01_cited_by_to_edgelist.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from datetime import datetime
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Flatten cited-by JSON lines into a (to, from, date) edgelist TSV."""
    parser = argparse.ArgumentParser(description='Make a citation network from the cited_by json')
    parser.add_argument('-i', help='Cited_by file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter = '\t')
            output.writerow(['to','from', 'date'])
            for line in i:
                line = json.loads(line)
                # edge points from the citing paper ('eid') to the cited parent
                output.writerow([line['parent_eid'], line['eid'], line['prism:coverDate']])


if __name__ == '__main__':
    main()
|
||||
|
||||
29
code/data_processing/02_filter_edgelist.py
Normal file
29
code/data_processing/02_filter_edgelist.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
|
||||
def main():
    """Filter the full edgelist down to citations coming from cited papers.

    Keeps only rows whose 'from' paper also appears in the 'to' column,
    i.e. citations made by papers that are themselves cited in our set.
    """
    parser = argparse.ArgumentParser(description='Take the edgelist, and reduce it to just the papers which are in our search')
    parser.add_argument('-i', help='Full edgelist file')
    parser.add_argument('-o', help='Edgelist output file')
    args = parser.parse_args()

    with open(args.i, 'r') as in_file:
        i = csv.reader(in_file, delimiter= '\t')
        next(i) # Discard header
        # Get the list of nodes to keep (every eid that is cited)
        nodes = set(x[0] for x in i)
        in_file.seek(0) # Start over at the beginning
        next(i)  # was missing: the header row was re-read as data after the seek
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter = '\t')
            output.writerow(['to','from', 'date'])
            for line in i:
                # keep the edge when its source paper is in the kept set
                if line[1] in nodes:
                    output.writerow(line)


if __name__ == '__main__':
    main()
|
||||
|
||||
62
code/data_processing/03_make_paper_aff_table.py
Normal file
62
code/data_processing/03_make_paper_aff_table.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Write a paper-to-affiliation TSV from the JSON-lines abstract dump."""
    parser = argparse.ArgumentParser(description='Generate paper to affiliation mapping file from abstracts file')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter='\t')
            output.writerow(['paper_eid','affiliation_id',
                             'organization','country'])
            # one output row per author-group affiliation per paper
            for line in i:
                entries = get_entries(line)
                for entry in entries:
                    output.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Turn one abstract JSON line into [eid, afid, organization, country] rows.

    Returns an empty list when the record carries no country information.
    """
    record = json.loads(l)['abstracts-retrieval-response']
    head = record['item']['bibrecord']['head']
    eid = record['coredata']['eid']

    countries = get_aff_info(head, 'country')
    if not countries:
        return []
    affiliation_ids = get_aff_info(head, '@afid')
    org_names = get_aff_info(head, 'organization')
    return [[eid, affiliation_ids[idx], org_names[idx], countries[idx]]
            for idx in range(len(countries))]
|
||||
|
||||
def get_aff_info(head, affiliation_key):
    """Collect one affiliation value per author group in *head*.

    Returns None when the record has no author-group at all; groups
    lacking the requested key contribute empty strings.
    """
    aff_info = []
    try:
        affiliations = make_list(head['author-group'])
    except KeyError:
        return None
    for x in affiliations:
        # "None" can appear in the author-group list; skip it
        if x is None:
            continue
        try:
            curr_inst = x['affiliation'][affiliation_key]
            # May return a string or a list. If it's a list, then
            # return the final value of that list (This is the base organization)
            # (note: the inner comprehension's x shadows the loop variable,
            # which is safe only because x is not used again afterwards)
            if isinstance(curr_inst, list):
                curr_inst = [x['$'] for x in curr_inst][-1]
            aff_info.append(curr_inst)
        except KeyError:
            # If there isn't affiliation info for these authors, return empty str
            aff_info.append('')
    return aff_info
|
||||
|
||||
def make_list(list_or_dict):
    """Wrap a lone value in a list; pass lists through unchanged."""
    if isinstance(list_or_dict, list):
        return list_or_dict
    return [list_or_dict]


if __name__ == '__main__':
    main()
|
||||
50
code/data_processing/04_make_paper_subject_table.py
Normal file
50
code/data_processing/04_make_paper_subject_table.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import json
|
||||
import argparse
|
||||
import csv
|
||||
|
||||
def main():
    """Read one JSON abstract record per line and write a TSV mapping each
    paper EID to its subject names and codes."""

    parser = argparse.ArgumentParser(description='Generate paper to subject mapping file from abstracts file')
    parser.add_argument('-i', help='Abstract file')
    parser.add_argument('-o', help='TSV output file')
    args = parser.parse_args()

    # Stream line-by-line so the full abstracts file never sits in memory.
    with open(args.i, 'r') as i:
        with open(args.o, 'w') as o:
            output = csv.writer(o, delimiter='\t')
            output.writerow(['paper_eid','subject',
                             'subject_code'])
            for line in i:
                # One input line = one paper; it may yield several subjects.
                entries = get_entries(line)
                for entry in entries:
                    output.writerow(entry)
|
||||
|
||||
|
||||
def get_entries(l):
    """Parse one JSON abstract record and return its subject rows.

    Each row is [paper_eid, subject_name, subject_code].
    (Removed an unreachable `return []` that followed the unconditional
    return in the original.)
    """
    json_response = json.loads(l)
    full = json_response['abstracts-retrieval-response']
    eid = full['coredata']['eid']
    subjects = get_subjects(full)
    # Prepend the eid, and return the subjects
    return [[eid, s[0], s[1]] for s in subjects]
|
||||
|
||||
|
||||
def get_subjects(abstract_response):
    """Return [name, code] pairs for every subject area on the record.

    Raises KeyError (after logging the offending record) when the
    response has no subject-area section.
    """
    try:
        subject_info = abstract_response['subject-areas']['subject-area']
    except KeyError:
        # BUG FIX: the original printed `result`, a name not yet bound at
        # this point, so the KeyError was masked by a NameError. Log the
        # response we failed on instead, then re-raise as before.
        print(abstract_response)
        raise
    # A single subject arrives as a dict rather than a list; normalize.
    if not isinstance(subject_info, list):
        subject_info = [subject_info]
    # Get the subject name and code for each entry.
    return [[s['$'], s['@code']] for s in subject_info]
|
||||
|
||||
|
||||
def make_list(list_or_dict):
    """Normalize a decoded-JSON field: non-lists get wrapped in a list."""
    return [list_or_dict] if not isinstance(list_or_dict, list) else list_or_dict
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
17
code/data_processing/05_save_descriptives.R
Normal file
17
code/data_processing/05_save_descriptives.R
Normal file
@@ -0,0 +1,17 @@
|
||||
# Load the processed abstracts, derive per-paper variables (year, word
# count), join on the paper/subject table, and save everything into one
# RData file for use in the paper.
df = read.csv('processed_data/abstracts.tsv',sep='\t', strip.white=TRUE)
df['date'] = as.Date(df$date)
# Empty strings in modal_country mean "unknown" -- recode as NA.
df$modal_country[df['modal_country'] == ''] <- NA
# NOTE(review): format() is applied to the one-column data.frame df['date']
# rather than the date vector df$date -- confirm this yields the intended
# year strings.
df['year'] = format(df['date'],'%Y')

# Keep only papers that actually have an abstract.
abstracts <- df[df['abstract'] != '',c('eid','abstract')]
# Creates a vector of word counts, based on counting all of the groups of alphanumeric characters
word_count <- apply(abstracts, 1, function(x) sapply(gregexpr("[[:alnum:]]+", x['abstract']), function(x) sum(x > 0)))

s = read.csv('processed_data/paper_subject_table.tsv', sep='\t')
# One row per (paper, subject) pair after the merge.
full <- merge(df,s, by.x = 'eid', by.y = 'paper_eid')

# zero these out before we save them so we don't save all of the abstracts.
full['abstract'] <- NULL
df['abstract'] <- NULL

save(df, abstracts, s, full, word_count, file="paper/data/orig_data_sets.RData")
|
||||
26
code/data_processing/make_network.py
Normal file
26
code/data_processing/make_network.py
Normal file
@@ -0,0 +1,26 @@
|
||||
'''Takes a CSV of retrieved articles, and creates an igraph
|
||||
network from them (not even close to done)'''
|
||||
|
||||
class CitationNetwork(igraph.Graph):
    """Directed citation graph assembled incrementally from retrieved
    articles. Per the module docstring this class is unfinished; the
    notes below flag the remaining inconsistencies without changing the
    public interface."""

    def __init__(self, network_type):
        super().__init__(directed=True)
        self.temp_edges = []
        self.temp_vertices = []
        # BUG FIX: add_citations appends to this list, but it was never
        # initialized anywhere.
        self.retrieved_eids = []
        self.network_type = network_type

    def add_vertices(self, to_node, from_nodes):
        # Queue (from, to) pairs for later materialization.
        # NOTE(review): this overrides igraph.Graph.add_vertices with an
        # incompatible signature, and make_network() below calls
        # self.add_vertices(nodes) with a single argument, which will hit
        # this override and fail. Renaming this method is the likely
        # eventual fix; left as-is to preserve the external interface.
        self.temp_vertices += [[from_node, to_node] for from_node in from_nodes]

    def make_network(self):
        # Get the unique set of nodes, and add them.
        # NOTE(review): temp_vertices holds [from, to] lists, so v['eid']
        # cannot work as written -- this method is part of the unfinished
        # surface noted above.
        nodes = set([v for v in self.temp_vertices if v['eid'] not in self.vs['name']])
        nodes = sorted(nodes)
        self.add_vertices(nodes)
        self.add_edges(self.temp_edges)
        self.es['weight'] = 1

    def collapse_weights(self):
        # Merge parallel edges, summing their weights into one edge.
        self.simplify(combine_edges={"weight": "sum"})

    def add_citations(self, eid, citations):
        # BUG FIX: the original signature omitted self, so the body's
        # reference to self raised at call time.
        self.retrieved_eids.append(eid)
|
||||
89
code/prediction/00_ngram_extraction.py
Normal file
89
code/prediction/00_ngram_extraction.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from time import time
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
import csv
|
||||
import argparse
|
||||
|
||||
n_features = 100000 # Gets the top n_features terms
|
||||
n_samples = None # Enter an integer here for testing, so it doesn't take so long
|
||||
|
||||
def main():
    """Load the abstracts, count n-grams, and write a long-format CSV of
    (document_id, term, frequency) rows."""

    parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
                        default=3)

    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    # n_samples is a module-level testing knob; None means "use everything".
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header (truncates any previous output file).
    write_header(args.o)

    # n_features (module-level) caps the vocabulary at the most frequent terms.
    bags_o_words = get_counts(data_samples, n_features, args.n)
    # write_output appends, so it must run after write_header.
    write_output(doc_ids, bags_o_words, args.o)
|
||||
|
||||
def get_counts(abstracts, n_features, ngram_max):
    """Count 1..ngram_max n-grams over the abstracts.

    Returns one {term: count} dict per document, restricted to the
    n_features most frequent terms (English stop words removed, terms in
    >95% or <2 documents dropped).
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                 max_features=n_features,
                                 stop_words='english',
                                 ngram_range=(1, ngram_max))
    start = time()
    counts = vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - start))

    vocabulary = vectorizer.get_feature_names()
    return to_bags_o_words(vocabulary, counts.toarray())
|
||||
|
||||
|
||||
def write_header(out_file):
    """Create/truncate out_file and write the CSV header row."""
    header = ['document_id', 'term', 'frequency']
    with open(out_file, 'w') as handle:
        csv.writer(handle).writerow(header)
|
||||
|
||||
def to_bags_o_words(terms, freqs):
    """Convert a vocabulary list plus a dense document-term count matrix
    into a list of per-document {term: count} dicts.

    Zero counts are omitted, so an all-zero row yields an empty dict.
    """
    return [
        {terms[idx]: count for idx, count in enumerate(row) if count > 0}
        for row in freqs
    ]
|
||||
|
||||
def write_output(ids, bags_o_words, out_file):
    """Append one (document_id, term, frequency) CSV row per term per
    document. ids and bags_o_words are parallel lists."""
    with open(out_file, 'a') as handle:
        writer = csv.writer(handle)
        for doc_id, bag in zip(ids, bags_o_words):
            # For each term and count, output a row, together with the document id
            for term, count in bag.items():
                writer.writerow([doc_id, term, count])
|
||||
|
||||
def get_ids_and_abstracts(fn, length_limit):
    """Read the abstracts TSV and return (ids, abstracts) as two aligned
    lists.

    Rows missing either column are printed and skipped. If length_limit
    is truthy, at most length_limit rows are returned (testing knob).
    """
    abstracts = []
    ids = []
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        for r in in_csv:
            # BUG FIX: the limit was checked after appending and compared
            # with a post-incremented counter, so length_limit=k returned
            # k+1 rows; check before processing instead.
            if length_limit and len(ids) >= length_limit:
                break
            # BUG FIX: the original appended the abstract before looking
            # up the eid, so a row with an abstract but no eid left the
            # two lists out of sync. Read both fields before appending.
            try:
                abstract = r['abstract']
                eid = r['eid']
            except KeyError:
                print(r)
                continue
            abstracts.append(abstract)
            ids.append(eid)
    return ids, abstracts
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
89
code/prediction/01-build_control_variables.R
Normal file
89
code/prediction/01-build_control_variables.R
Normal file
@@ -0,0 +1,89 @@
|
||||
source("code/prediction/utils.R")
|
||||
|
||||
# use this to store things for use in the paper
|
||||
pred.descrip <- NULL
|
||||
|
||||
abstracts <- read.delim("processed_data/abstracts.tsv", header=TRUE,
|
||||
stringsAsFactors=FALSE, sep="\t")
|
||||
|
||||
abstracts <- subset(abstracts, select = -abstract)
|
||||
|
||||
abstracts <- abstracts[abstracts$aggregation_type != "Trade Journal" &
|
||||
is.na(abstracts$aggregation_type) == FALSE, ]
|
||||
|
||||
names(abstracts)[names(abstracts) == 'num_citations'] <- 'works_cited'
|
||||
abstracts$works_cited[is.na(abstracts$works_cited) == TRUE] <- 0
|
||||
|
||||
# affiliations
|
||||
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
|
||||
header=TRUE, stringsAsFactors=FALSE,
|
||||
sep="\t")
|
||||
|
||||
# eliminate missing values
|
||||
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
|
||||
affiliations$organization != "", ]
|
||||
|
||||
|
||||
# Map an affiliation id to the modal (most frequent) organization name
# recorded for that id in aff.df.
# BUG FIX: the original ignored its aff.df argument and always read the
# global `affiliations` data frame; use the argument instead (the default
# value keeps existing call sites unchanged).
remap.affiliations <- function(aff.id,
                               aff.df = affiliations){
    org.counts <- sort(table(aff.df$organization[
        aff.df$affiliation_id == aff.id]))
    # tail(..., 1) of the ascending sort is the most frequent name.
    org.modal <- names(tail(org.counts, 1))
    return(org.modal)
}
|
||||
|
||||
affiliations$organization <- sapply(affiliations$affiliation_id, remap.affiliations)
|
||||
|
||||
affiliations <- subset(affiliations, select = c(paper_eid,
|
||||
organization))
|
||||
names(affiliations) <- c("eid", "affiliation")
|
||||
|
||||
# need to remove repeat affiliations
|
||||
affiliations <- affiliations[duplicated(affiliations$eid) == FALSE,]
|
||||
|
||||
|
||||
######################################
|
||||
d <- abstracts[, c("eid", "language", "modal_country",
|
||||
"source_title", "works_cited")]
|
||||
|
||||
# dichotomous dependent variable
|
||||
d$cited <- abstracts$cited_by_count > 0
|
||||
|
||||
|
||||
# store this here for use in the paper before we run any restrictions:
|
||||
pred.descrip$cited <- d$cited
|
||||
pred.descrip$cites <- abstracts$cited_by_count
|
||||
|
||||
|
||||
# We want these to be categorical variables
|
||||
d$modal_country <- factor(d$modal_country)
|
||||
d$language <- factor(d$language)
|
||||
d$subject <- factor(abstracts$first_ASJC_subject_area)
|
||||
d$source_title <- factor(d$source_title)
|
||||
d$month <- factor(strftime(abstracts$date, format= "%m"))
|
||||
# except for pub year - keep that continuous
|
||||
d$year <- as.numeric(strftime(abstracts$date, format="%Y"))
|
||||
|
||||
# bring in org affiliations
|
||||
d <- merge(d, affiliations, by="eid") # note that this drops papers
|
||||
# w/out org info
|
||||
|
||||
d$affiliation <- factor(d$affiliation)
|
||||
|
||||
##### Restrictions:
|
||||
|
||||
### do this explicitly so that changes are easy:
|
||||
d <- restrict(d, d$affiliation, 1)
|
||||
d <- restrict(d, d$subject, 1)
|
||||
d <- restrict(d, d$source_title, 1)
|
||||
d <- restrict(d, d$language, 1)
|
||||
d <- restrict(d, d$modal_country, 1)
|
||||
|
||||
# n.authors
|
||||
# per author prior citations
|
||||
|
||||
pred.descrip$covars <- d
|
||||
save(pred.descrip, file = "paper/data/prediction_descriptives.RData")
|
||||
|
||||
|
||||
rm(d, abstracts, affiliations)
|
||||
|
||||
56
code/prediction/02-build_textual_features.R
Normal file
56
code/prediction/02-build_textual_features.R
Normal file
@@ -0,0 +1,56 @@
|
||||
# Build a wide, dichotomous paper-by-ngram matrix from the long-format
# ngram count table, keeping only terms that appear across many subject
# areas, and save it for the prediction analysis.
library(data.table)

# import ngram data
# note that the file is not pushed to repository, but is available on
# hyak at: /com/users/jdfoote/css_chapter/ngram_table.csv

# Top 100,000 ngrams (?)  Column 3 (the frequency) is dropped on read.
ngrams <- read.delim("processed_data/ngram_table.csv", sep=",",
                     header=TRUE, stringsAsFactors=FALSE)[,-3]
names(ngrams)[1] <- "eid"

subjects <- read.delim("processed_data/abstracts.tsv", header=TRUE,
                       stringsAsFactors=FALSE, sep="\t")[,c("eid",
                       "first_ASJC_subject_area")]
names(subjects)[2] <- "subject"

# takes a couple of minutes:
ngrams <- merge(ngrams, subjects, by="eid", all.x=TRUE)

# only use ngrams that occur accross all (many?) subject areas
# Count how many distinct subject areas each term appears in.
subject.by.ngram <- tapply(ngrams$subject, ngrams$term, function(x)
    length(unique(x)))

# summary(subject.by.ngram)
#
# library(txtplot)
# txtdensity(log(subject.by.ngram))

# Note:
# The median number of subject areas per term is five. We cut off at
# terms that occur across more than 30 subject areas (the filter below
# is strictly greater-than).

top.ngrams <- ngrams[ngrams$term %in%
                     names(subject.by.ngram[subject.by.ngram >
                                            30]),c("eid", "term")]

# Free the large intermediates before the expensive reshape.
rm(ngrams, subject.by.ngram, subjects)

# convert to a wide format matrix of dichotomous variables
library(reshape2)
library(data.table)

top.ngrams <- data.table(top.ngrams)
setkey(top.ngrams, eid)

# Marker column so dcast has something to aggregate.
top.ngrams[,vv:= TRUE]

# took more than 20 minutes on hyak
top.ngram.matrix <- dcast(top.ngrams, eid ~ term, length,
                          value.var = "vv")

rm(top.ngrams)

save(top.ngram.matrix, file="processed_data/top.ngram.matrix.RData")
#load("processed_data/top.ngram.matrix.RData")
|
||||
221
code/prediction/03-prediction_analysis.R
Normal file
221
code/prediction/03-prediction_analysis.R
Normal file
@@ -0,0 +1,221 @@
|
||||
library(data.table)
|
||||
library(Matrix)
|
||||
library(glmnet)
|
||||
library(xtable)
|
||||
library(methods)
|
||||
|
||||
predict.list <- NULL
|
||||
|
||||
if(!exists("top.ngram.matrix")){
|
||||
load("processed_data/top.ngram.matrix.RData")
|
||||
}
|
||||
|
||||
if(!exists("pred.descrip")){
|
||||
load("paper/data/prediction_descriptives.RData")
|
||||
covars <- pred.descrip$covars
|
||||
}
|
||||
|
||||
top.ngram.matrix <- data.table(top.ngram.matrix)
|
||||
setkey(top.ngram.matrix, eid)
|
||||
covars <- data.table(pred.descrip$covars)
|
||||
setkey(covars,eid)
|
||||
|
||||
# restrict to the overlap of the two datasets
|
||||
covars <- covars[covars$eid %in% top.ngram.matrix$eid,]
|
||||
|
||||
top.ngram.matrix <- top.ngram.matrix[top.ngram.matrix$eid %in%
|
||||
covars$eid,]
|
||||
|
||||
# rename the cited column in case it doesn't appear
|
||||
names(covars)[names(covars) == 'cited'] <- 'cited.x'
|
||||
|
||||
# then merge also to facilitate some manipulations below
|
||||
d <- merge(covars, top.ngram.matrix, by="eid", all=FALSE)
|
||||
|
||||
# Note that this duplicates some column names so X gets appended in a
|
||||
# few cases.
|
||||
|
||||
# construct model matrices
|
||||
x.controls <- sparse.model.matrix(cited.x ~ language.x +
|
||||
modal_country + month.x,
|
||||
data=d)[,-1]
|
||||
|
||||
x.aff <- sparse.model.matrix(cited.x ~ affiliation, data=d)[,-1]
|
||||
x.subj <- sparse.model.matrix(cited.x ~ subject.x, data=d)[,-1]
|
||||
x.venue <- sparse.model.matrix(cited.x ~ source_title, data=d)[,-1]
|
||||
|
||||
x.ngrams <- as.matrix(subset(top.ngram.matrix, select=-eid))
|
||||
x.ngrams <- as(x.ngrams, "sparseMatrix")
|
||||
|
||||
X <- cBind(x.controls, covars$year.x, covars$works.cited)
|
||||
X.aff <- cBind(X, x.aff)
|
||||
X.subj <- cBind(X.aff, x.subj)
|
||||
X.venue <- cBind(X.subj, x.venue)
|
||||
X.terms <- cBind(X.venue, x.ngrams)
|
||||
|
||||
Y <- covars$cited
|
||||
|
||||
### Hold-back sample for testing model performance later on:
|
||||
set.seed(20160719)
|
||||
holdback.index <- sample(nrow(X), round(nrow(X)*.1))
|
||||
|
||||
X.hold <- X[holdback.index,]
|
||||
X.hold.aff <- X.aff[holdback.index,]
|
||||
X.hold.subj <- X.subj[holdback.index,]
|
||||
X.hold.venue <- X.venue[holdback.index,]
|
||||
X.hold.terms <- X.terms[holdback.index,]
|
||||
Y.hold <- Y[holdback.index]
|
||||
|
||||
X.test <- X[-holdback.index,]
|
||||
X.test.aff <- X.aff[-holdback.index,]
|
||||
X.test.subj <- X.subj[-holdback.index,]
|
||||
X.test.venue <- X.venue[-holdback.index,]
|
||||
X.test.terms <- X.terms[-holdback.index,]
|
||||
Y.test <- Y[-holdback.index]
|
||||
|
||||
############### Models and prediction
|
||||
|
||||
set.seed(20160719)
|
||||
|
||||
m.con <- cv.glmnet(X.test, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
con.pred = predict(m.con, type="class", s="lambda.min",
|
||||
newx=X.hold)
|
||||
|
||||
m.aff <- cv.glmnet(X.test.aff, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
aff.pred = predict(m.aff, type="class", s="lambda.min",
|
||||
newx=X.hold.aff)
|
||||
|
||||
m.subj <- cv.glmnet(X.test.subj, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
subj.pred = predict(m.subj, type="class", s="lambda.min",
|
||||
newx=X.hold.subj)
|
||||
|
||||
m.venue <- cv.glmnet(X.test.venue, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
venue.pred = predict(m.venue, type="class", s="lambda.min",
|
||||
newx=X.hold.venue)
|
||||
|
||||
m.terms <- cv.glmnet(X.test.terms, Y.test, alpha=1, family="binomial",
|
||||
type.measure="class")
|
||||
terms.pred = predict(m.terms, type="class", s="lambda.min",
|
||||
newx=X.hold.terms)
|
||||
|
||||
##########
|
||||
# Compare test set predictions against held-back sample:
|
||||
|
||||
pred.df <- data.frame(cbind(con.pred, aff.pred, subj.pred,
|
||||
venue.pred, terms.pred))
|
||||
names(pred.df) <- c("Controls", "+ Affiliation", "+ Subject", "+ Venue",
|
||||
"+ Terms")
|
||||
|
||||
m.list <- list(m.con, m.aff, m.subj, m.venue, m.terms)
|
||||
|
||||
# collect:
|
||||
# df
|
||||
# percent.deviance
|
||||
# nonzero coefficients
|
||||
# prediction error
|
||||
|
||||
# Summarize a fitted cv.glmnet model: number of nonzero coefficients
# (df), percent of deviance explained, and cross-validation error, the
# latter two as percentages.
# NOTE(review): all three values are read from the *last* entry of the
# lambda path (tail(..., 1)), not at lambda.min -- confirm that is the
# intended summary, since predictions elsewhere use s="lambda.min".
gen.m.summ.info <- function(model){
    df <- round(tail(model$glmnet.fit$df, 1),0)
    percent.dev <- round(tail(model$glmnet.fit$dev.ratio, 1),2)*100
    cv.error <- round(tail(model$cvm,1),2)*100
#    null.dev <- round(tail(model$glmnet.fit$nulldev),0)
    out <- c(df, percent.dev, cv.error)
    return(out)
}
|
||||
|
||||
# Misclassification rate (as a percentage): the sum of the off-diagonal
# cells of the prediction-vs-truth confusion table.
# NOTE(review): props[1,2] and props[2,1] assume both classes appear in
# both pred and test (a full 2x2 table); a degenerate one-row or
# one-column table would error here.
gen.class.err <- function(pred, test){
    props <- prop.table(table(pred, test))
    err.sum <- round(sum(props[1,2], props[2,1]),2)*100
    return(err.sum)
}
|
||||
|
||||
|
||||
results.tab <- cbind(names(pred.df),data.frame(matrix(unlist(lapply(m.list,
|
||||
gen.m.summ.info)),
|
||||
byrow=T, nrow=5)))
|
||||
|
||||
results.tab$class.err <- sapply(pred.df, function(x) gen.class.err(x,
|
||||
Y.hold))
|
||||
|
||||
results.tab <- data.frame(lapply(results.tab, as.character))
|
||||
|
||||
|
||||
|
||||
names(results.tab) <- c("Model", "N features", "Deviance (%)",
|
||||
"CV error (%)", "Hold-back error (%)")
|
||||
|
||||
|
||||
print(xtable(results.tab,
|
||||
caption=
|
||||
"Summary of fitted models predicting any citations. The ``Model'' column describes which features were included. The N features column shows the number of features included in the prediction. ``Deviance'' summarizes the goodness of fit as a percentage of the total deviance accounted for by the model. ``CV error'' (cross-validation error) reports the prediction error rates of each model in the cross-validation procedure conducted as part of the parameter estimation process. ``Hold-back error'' shows the prediction error on a random 10 percent subset of the original dataset not included in any of the model estimation procedures.",
|
||||
label='tab:predict_models', align='llrrrr'),
|
||||
include.rownames=FALSE)
|
||||
|
||||
# Store the results:
|
||||
predict.list$results.tab <- results.tab
|
||||
|
||||
|
||||
|
||||
|
||||
############# Generate most salient coefficients
|
||||
nz.coefs <- data.frame( coef =
|
||||
colnames(X.test.terms)[which(
|
||||
coef(m.terms, s="lambda.min")
|
||||
!= 0)],
|
||||
type = "term",
|
||||
beta =
|
||||
coef(m.terms,
|
||||
s="lambda.min")[which(coef(m.terms,
|
||||
s="lambda.min")
|
||||
!= 0)])
|
||||
|
||||
nz.coefs$coef <- as.character(nz.coefs$coef)
|
||||
nz.coefs$type <- as.character(nz.coefs$type)
|
||||
nz.coefs <- nz.coefs[order(-abs(nz.coefs$beta)),]
|
||||
|
||||
# comparison:
|
||||
|
||||
#nz.coefs$type <- "terms"
|
||||
nz.coefs$type[grepl("(Intercept)", nz.coefs$coef)] <- NA
|
||||
nz.coefs$type[grepl("source_title", nz.coefs$coef)] <- "venue"
|
||||
nz.coefs$type[grepl("subject.x", nz.coefs$coef)] <- "subject"
|
||||
nz.coefs$type[grepl("affiliation", nz.coefs$coef)] <- "affiliation"
|
||||
nz.coefs$type[grepl("month.x", nz.coefs$coef)] <- "month"
|
||||
nz.coefs$type[grepl("modal_country", nz.coefs$coef)] <- "country"
|
||||
nz.coefs$type[grepl("language", nz.coefs$coef)] <- "language"
|
||||
nz.coefs$type[grepl("^20[0-9]{2}$", nz.coefs$coef)] <- "year"
|
||||
|
||||
|
||||
# cleanup
|
||||
nz.coefs$coef <- gsub("source_title", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("subject.x", "", nz.coefs$coef)
|
||||
nz.coefs$coef <- gsub("affiliation","", nz.coefs$coef)
|
||||
nz.coefs$beta <- round(nz.coefs$beta, 3)
|
||||
names(nz.coefs) <- c("Feature", "Type", "Coefficient")
|
||||
|
||||
predict.list$nz.coefs <- nz.coefs
|
||||
|
||||
# table for all
|
||||
round(prop.table(table(nz.coefs$Type))*100, 2)
|
||||
|
||||
# for top subsets
|
||||
round(prop.table(table(nz.coefs$Type[1:700]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:200]))*100, 2)
|
||||
round(prop.table(table(nz.coefs$Type[1:100]))*100, 2)
|
||||
|
||||
print(xtable(
|
||||
as.matrix(head(nz.coefs, 10)),
|
||||
label='tab:nzcoefs',
|
||||
caption='Feature, variable type, and beta value for top 100 non-zero coefficients estimated by the best fitting model with all features included.',
|
||||
align='lllr'
|
||||
), include.rownames=FALSE)
|
||||
|
||||
|
||||
# output
|
||||
save(predict.list, file="paper/data/prediction.RData")
|
||||
|
||||
|
||||
13
code/prediction/utils.R
Normal file
13
code/prediction/utils.R
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Use this to check for underpopulated cells
|
||||
# Use this to check for underpopulated cells
# Returns, for each level of the categorical variable c.var, the number
# of unique papers (eids) in df carrying that level.
gen.counts <- function(df, c.var){
    tapply(df[,"eid"], c.var, function(x) length(unique(x)))
}
|
||||
|
||||
# use this to remove underpopulated cells
|
||||
# use this to remove underpopulated cells
# Keeps only rows of df whose value of c.var occurs in strictly more
# than c.min unique papers (per gen.counts).
restrict <- function(df, c.var, c.min){
    var.counts <- gen.counts(df, c.var)
    out.df <- df[c.var %in% names(var.counts[var.counts >
                                             c.min]),]
    return(out.df)
}
|
||||
126
code/topic_modeling/00_topics_extraction.py
Normal file
126
code/topic_modeling/00_topics_extraction.py
Normal file
@@ -0,0 +1,126 @@
|
||||
|
||||
from time import time
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||
from sklearn.decomposition import NMF, LatentDirichletAllocation
|
||||
import sys
|
||||
import csv
|
||||
import pandas as pd
|
||||
import argparse
|
||||
|
||||
"""
|
||||
This code was inspired/copied from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.
|
||||
|
||||
It takes in an abstract file, and creates two outputs: The abstracts together with their topic distribution and a set of topics and the top words associated with each.
|
||||
"""
|
||||
|
||||
n_samples = None # Enter an integer here for testing.
|
||||
n_features = 20000
|
||||
n_topics = 12
|
||||
|
||||
def main():
    """Fit an LDA topic model to the abstracts and write two CSVs: each
    abstract with its topic probability distribution, and the top words
    for each topic (both with columns ordered by topic prevalence)."""

    parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    # n_samples is a module-level testing knob; a None slice is a no-op.
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95, # Terms that show up in > max_df of documents are ignored
                                    min_df=2, # Terms that show up in < min_df of documents are ignored
                                    max_features=n_features, # Only use the top max_features
                                    stop_words='english',
                                    ngram_range=(1,2))
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2017,
                                    n_jobs=2)
    t0 = time()
    # BUG FIX: the original called lda.fit(tf) and then lda.fit_transform(tf),
    # fitting the expensive model twice; one fit_transform yields the same
    # fitted model (used below via lda.components_) plus the doc-topic matrix.
    doc_topics = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    # Change the values into a probability distribution for each abstract
    topic_dist = [[topic/sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in doc_topics]

    # Make the topic distribution into a dataframe
    td = pd.DataFrame(topic_dist)
    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Get the top words by topic
    topic_words = get_top_words(lda, tf_feature_names, 20)
    # Sort columns by how often each topic is used, most-used first.
    # (reindex(..., axis=1) replaces reindex_axis, which was removed in
    # pandas 0.25; behavior is identical.)
    order = sorted(topic_words.columns, key=lambda x: td[x].sum(), reverse=True)
    topic_words = topic_words.reindex(order, axis=1)

    # Rearrange the columns by how often each topic is used
    td = td.reindex(sorted(td.columns, key=lambda x: td[x].sum(), reverse=True),
                    axis=1)

    topic_words.to_csv(args.t, index=False)

    # Join each abstract's metadata with its topic distribution and save.
    df = pd.DataFrame(doc_data)
    df = df.join(td)

    df.to_csv(args.o, index=False)
|
||||
|
||||
def get_abstracts(fn):
    """Read the abstracts TSV and return (abstracts, doc_data).

    abstracts holds the abstract text, doc_data the full row dicts, for
    every row whose abstract looks real (more than 5 characters). Rows
    with no 'abstract' column are printed and skipped.
    """
    abstracts = []
    doc_data = []
    with open(fn, 'r') as handle:
        for row in csv.DictReader(handle, delimiter='\t'):
            try:
                text = row['abstract']
            except KeyError:
                print(row)
                continue
            # If this isn't really an abstract, then don't add it.
            if len(text) > 5:
                abstracts.append(row['abstract'])
                doc_data.append(row)
    return abstracts, doc_data
|
||||
|
||||
def get_top_words(model, feature_names, n_top_words):
    """Takes the model, the words used, and the number of words requested.
    Returns a dataframe with one column per topic, holding that topic's
    n_top_words highest-weight terms (multi-word terms quoted)."""
    top_words = pd.DataFrame()
    # For each topic in the fitted model...
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so slice the last n_top_words in reverse
        # order to get the highest-weight terms first.
        # (Renamed the comprehension variable: the original reused `i`
        # for both the topic index and the term index, which read as
        # shadowing.)
        top_words[topic_idx] = [add_quotes(feature_names[term_idx])
                                for term_idx
                                in topic.argsort()[:-n_top_words - 1:-1]]
    return top_words
|
||||
|
||||
def add_quotes(s):
    """Wrap multi-word phrases in double quotes; single words pass through."""
    return '"{}"'.format(s) if " " in s else s
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
103
code/topic_modeling/01_make_paper_files.py
Normal file
103
code/topic_modeling/01_make_paper_files.py
Normal file
@@ -0,0 +1,103 @@
|
||||
'''Creates the figures and tables for LaTeX'''
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import datetime
|
||||
import argparse
|
||||
import os
|
||||
|
||||
# Human-readable names for the 12 LDA topics, in model order. These become
# column headers in the paper's LaTeX tables, so spelling matters.
topic_names = [
    'Media Use',
    'Social Network Analysis',
    # BUG FIX: was misspelled 'Consumer Analsyis'; the typo would have
    # appeared verbatim in the published tables.
    'Consumer Analysis',
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]
|
||||
|
||||
def main():
    """Build the LaTeX topic-word tables and save per-year topic summary
    statistics (sums, means, citation-weighted sums) to an RData file for
    the paper."""

    parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
                        default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
                        default='paper/tables/')
    parser.add_argument('-o', help = 'RData output file location',
                        default = 'paper/data/topic_model_data.RData')

    args = parser.parse_args()

    # Make the top_words tables
    tw = pd.read_csv(args.w)
    # Add names -- assumes the module-level topic_names list matches the
    # column count and order of the top-words file.
    tw.columns = topic_names
    # Save as 2 different tables, because they are too long
    if not os.path.exists(args.t):
        os.makedirs(args.t)
    tw.to_latex(args.t + 'topic_words1.tex',index=False, columns=tw.columns[:6])
    tw.to_latex(args.t + 'topic_words2.tex',index=False, columns=tw.columns[6:])

    # Load the abstracts and topics data
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)
    # Change to datetime
    df.date = pd.to_datetime(df.date)

    # Remove papers from 2016 since we don't have the entire year, so graphs are misleading
    df = df[df.date <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')
    # Rename the last columns as the topic names
    df.columns = list(df.columns[:-n_topics]) + topic_names
    # Group by year, and get only the LDA columns
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
    # Get summary statistics for each topic
    # Total amount published in each topic by year
    topic_sums = topics_by_year.sum()
    # Mean amount published in each topic
    topic_means = topics_by_year.mean()
    # Now, we weight the contributions by how much a paper has been cited.
    # Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
    # T1: .5
    # T2: .3
    # T3: 0
    # T4: .2
    # To account for how influential a paper is, we take all of the topic columns for a document
    # and multiplies their weights by the logged citations the paper has received.
    citation_weighted_topics = df[df.columns[-n_topics:]]
    citation_weighted_topics = citation_weighted_topics.apply(lambda x: x * np.log1p(df.cited_by_count), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## write data to R
    # import code to write r modules and create our variable we'll write to
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    r = {'weighted_sums' : weighted_sums,
         'topic_sums' : topic_sums,
         'topic_means' : topic_means }

    # R convention prefers dots to underscores in object names.
    for var_name, x in r.items():
        robjects.r.assign(var_name.replace("_", "."), x)

    if not os.path.exists(os.path.dirname(args.o)):
        os.makedirs(os.path.dirname(args.o))

    # Save all three objects into one RData file, then clear the embedded
    # R workspace so repeated calls don't accumulate state.
    robjects.r('save({},file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o
    ))
    robjects.r("rm(list=ls())")
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user