initial import of material for public archive into git

We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00
commit dd420c77de
41 changed files with 7069 additions and 0 deletions

@@ -0,0 +1,232 @@
# coding: utf-8
# # Import data and get things set up
import random
random.seed(9001)
# import the rpy2 machinery for writing R data and create the dictionary we'll write our results to
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
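# pandas2ri converts pandas DataFrames into R data.frames when we hand them to R in save_to_r below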
r = {}
def remember(name, x):
    r[name] = x
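# e.g., remember('total_articles', articles.shape[0]) stashes a value under that name for the R export at the end of this script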
# load in modules we'll need for analysis
import subprocess
import csv
from igraph import *
import pandas as pd
import numpy as np
import re
# grab the largest connected component with a little function
def get_largest_component(g):
    g_components = g.components(mode="WEAK")
    max_size = max(g_components.sizes())
    for g_tmp in g_components.subgraphs():
        if g_tmp.vcount() == max_size:
            return(g_tmp)
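# (note: igraph's VertexClustering.giant() should give the same result as get_largest_component)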
# load the full edgelist into igraph
def edge_list_iter(df):
    for i, row in df.iterrows():
        yield (row['from'], row['to'])
# list top 5 journals for each of the clusters
def top_journals_for_clusters(clu):
    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])
    output = pd.DataFrame()
    for cid in articles_tmp['cluster'].unique():
        journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)
        tmp = pd.DataFrame({'cluster' : cid, 'count' : journal_counts })
        output = output.append(tmp)
    output = output.reset_index()
    output = output.rename(columns = {'index' : "journal"})
    return(output)
def infomap_edgelist(g, edgelist_filename, directed=True):
    nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,
                                'eid' : v['name']} for v in g.vs ])

    # write out the edgelist to an external file so we can call infomap on it
    with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
        for e in g.es:
            if e.source != e.target:
                if 'weight' in e.attributes():
                    print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
                else:
                    print("{}\t{}".format(e.source, e.target), file=f)

    # run the external program to generate the infomap clustering
    # (each argument must be its own list element for subprocess.call)
    infomap_cmdline = ["code/bibliometrics/infomap/Infomap",
                       "code/bibliometrics/" + edgelist_filename + ".txt",
                       "code/bibliometrics/output_dir",
                       "-z", "--map", "--clu", "--tree"]
    if directed:
        infomap_cmdline.append("-d")
    subprocess.call(infomap_cmdline)

    # load up the clu data
    clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
                      header=None, comment="#", delim_whitespace=True)
    clu.columns = ['node_infomap', 'cluster', 'flow']
    return pd.merge(clu, nodes_tmp, on="node_infomap")
def write_graphml(g, clu, graphml_filename):
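    # node_infomap comes from igraph's vertex index (see infomap_edgelist), so sorting on it lines the cluster labels up with g's vertex order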
    clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
    g.vs["cluster"] = clu["cluster"].tolist()
    g.write_graphml("code/bibliometrics/" + graphml_filename)
# load article data
articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")
# # network for just the central "social media" set
# this contains the list of all INCOMING citations to each paper in the original set
raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")
g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)
g_sm = get_largest_component(g_sm_all)
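# simplify() collapses parallel edges and drops self-loops before we run the clustering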
g_sm = g_sm.simplify()
g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)
g_sm_clu['cluster'].value_counts()
write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
# # larger network that contains the incoming cites to citing articles
# this contains the list of all INCOMING citations to everything in the original set
# plus every INCOMING citation to every paper that cites one of those papers
raw_edgelist_files = ["processed_data/citation_edgelist.txt",
"processed_data/social_media_edgelist.txt"]
combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])
g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)
g_full = get_largest_component(g_full_all)
g_full = g_full.simplify()
g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)
g_full_clu['cluster'].value_counts()
top_journals_for_clusters(g_full_clu)
write_graphml(g_full, g_full_clu, "g_full.graphml")
# # create the meta-network of connections between clusters
edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]], how="inner", left_on="to", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
edgelist_tmp.drop('eid', axis=1, inplace=True)
edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]], how="inner", left_on="from", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
edgelist_tmp.drop('eid', axis=1, inplace=True)
edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]
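# cross-tabulate to count citations between each pair of clusters, then melt the table back into a long edgelist with the counts in a 'value' column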
cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
cluster_edgelist["to_cluster"] = cluster_edgelist.index
cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]
remember("cluster_edgelist", cluster_edgelist)
top_clusters = g_sm_clu["cluster"].value_counts().head(6).index
# write the edgelist restricted to the top clusters (currently the 6 largest)
cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
(cluster_edgelist["from_cluster"].isin(top_clusters))]
cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]
g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values], directed=True)
g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()
# assign the number of total articles as an attribute for each node
g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()
g_cluster.write_graphml("code/bibliometrics/clusters.graphml")
# # create network stats for tables (overall and within clusters)
def create_network_stats(g):
    network_stats = pd.DataFrame({'eid' : g.vs['name'],
                                  'eig_cent' : g.eigenvector_centrality(),
                                  'indegree' : g.indegree(),
                                  'betweenness' : g.betweenness()})

    network_stats = pd.merge(network_stats,
                             articles[['eid', 'title', 'source_title']],
                             how="inner")
    return network_stats
network_stats = create_network_stats(g_full)
network_stats.sort_values("indegree", ascending=False).head(4)
network_stats.sort_values("eig_cent", ascending=False).head(4)
network_stats.sort_values("betweenness", ascending=False).head(4)
# # things to store
remember('total_articles', articles.shape[0])
# total number of citations in the sm dataset
remember('sm_citations', raw_edgelist.shape[0])
remember('sm_citing', len(raw_edgelist["from"].unique()))
# the number of articles in the original dataset that have any INCOMING citations
remember('sm_cited', len(raw_edgelist["to"].unique()))
# total number of citations in the full dataset
remember('all_citations', combo_raw_edgelist.shape[0])
remember('all_citing', len(combo_raw_edgelist["from"].unique()))
# the number of articles in the full dataset that have any INCOMING citations
remember('all_cited', len(combo_raw_edgelist["to"].unique()))
remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])
sorted(r.keys())
# save the r dictionary to an RData file
def save_to_r(r_dict, filename="output.RData"):
    for var_name, x in r_dict.items():
        var_name = var_name.replace('_', '.')
        if type(x) == np.int64:
            x = np.asscalar(x)

        if type(x) == pd.DataFrame:
            rx = pandas2ri.py2ri(x)
        else:
            rx = x

        robjects.r.assign(var_name, rx)

    # create a new variable called r in R that collects everything we assigned
    robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
    robjects.r('save("r", file="{}")'.format(filename))
    robjects.r("rm(list=ls())")
save_to_r(r, "paper/data/network_data.RData")
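# in R, load("paper/data/network_data.RData") should restore a single list named r; keys use dots instead of underscores, e.g. r[["total.articles"]]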