initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
code/topic_modeling/00_topics_extraction.py (new file, 126 lines added)

@@ -0,0 +1,126 @@
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import sys
import csv
import pandas as pd
import argparse

"""
This code was inspired/copied from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.

It takes in an abstracts file and creates two outputs: the abstracts together with their topic distribution, and a set of topics with the top words associated with each.
"""

n_samples = None  # Enter an integer here for testing.
n_features = 20000
n_topics = 12


def main():
    parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,  # Terms that show up in > max_df of documents are ignored
                                    min_df=2,  # Terms that show up in < min_df of documents are ignored
                                    max_features=n_features,  # Only use the top max_features
                                    stop_words='english',
                                    ngram_range=(1, 2))
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2017,
                                    n_jobs=2)
    t0 = time()
    model = lda.fit(tf)
    transformed_model = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    # Change the values into a probability distribution for each abstract
    topic_dist = [[topic / sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in transformed_model]

    # Make the topic distribution into a dataframe
    td = pd.DataFrame(topic_dist)
    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Get the top words by topic
    topic_words = get_top_words(lda, tf_feature_names, 20)
    # Sort the topic columns by how often each topic is used
    topic_words = topic_words.reindex_axis(sorted(topic_words.columns, key=lambda x: td[x].sum(), reverse=True), axis=1)

    # Rearrange the distribution columns by how often each topic is used
    td = td.reindex_axis(sorted(td.columns, key=lambda x: td[x].sum(), reverse=True), axis=1)

    topic_words.to_csv(args.t, index=False)

    df = pd.DataFrame(doc_data)
    df = df.join(td)

    df.to_csv(args.o, index=False)


def get_abstracts(fn):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        doc_data = []
        for r in in_csv:
            try:
                curr_abstract = r['abstract']
                # If this isn't really an abstract, then don't add it
                if len(curr_abstract) > 5:
                    # Add the abstract to the corpus, and save the row's data
                    abstracts.append(r['abstract'])
                    doc_data.append(r)
            except KeyError:
                print(r)
        return abstracts, doc_data


def get_top_words(model, feature_names, n_top_words):
    '''Takes the model, the words used, and the number of words requested.
    Returns a dataframe of the top n_top_words for each topic'''
    r = pd.DataFrame()
    # For each topic
    for i, topic in enumerate(model.components_):
        # Get the top feature names, and put them in that column
        r[i] = [add_quotes(feature_names[j])
                for j in topic.argsort()[:-n_top_words - 1:-1]]
    return r


def add_quotes(s):
    '''Adds quotes around multi-term phrases'''
    if " " in s:
        s = '"{}"'.format(s)
    return s


if __name__ == '__main__':
    main()
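For reference, a typical invocation with the default paths would look something like the line below. This assumes the script is run from the repository root, that the processed_data/ directory already exists, and that the input TSV has an 'abstract' column; the flags simply restate the argparse defaults in the script.

    python code/topic_modeling/00_topics_extraction.py -i processed_data/abstracts.tsv -o processed_data/abstracts_LDA.csv -t processed_data/top_words.csv

The two outputs are then the per-abstract topic distributions (-o) and the top 20 words for each topic (-t).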