initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
code/topic_modeling/00_topics_extraction.py (new file, 126 lines added)

@@ -0,0 +1,126 @@
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import sys
import csv
import pandas as pd
import argparse

"""
This code was inspired/copied from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html.

It takes in an abstracts file and creates two outputs: the abstracts together with their topic distribution, and a set of topics with the top words associated with each.
"""

n_samples = None  # Enter an integer here for testing.
n_features = 20000
n_topics = 12


def main():
    parser = argparse.ArgumentParser(description='Program to use LDA to create topics and topic distributions from a set of abstracts.')
    parser.add_argument('-i', help='Abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Where to output results',
                        default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-t', help='Where to output topics and top words associated with them',
                        default='processed_data/top_words.csv')
    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    dataset, doc_data = get_abstracts(args.i)
    data_samples = dataset[:n_samples]
    doc_data = doc_data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,  # Terms that show up in > max_df of documents are ignored
                                    min_df=2,  # Terms that show up in < min_df of documents are ignored
                                    max_features=n_features,  # Only use the top max_features
                                    stop_words='english',
                                    ngram_range=(1, 2))
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2017,
                                    n_jobs=2)
    t0 = time()
    model = lda.fit(tf)
    transformed_model = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))

    # Change the values into a probability distribution for each abstract
    topic_dist = [[topic / sum(abstract_topics) for topic in abstract_topics]
                  for abstract_topics in transformed_model]

    # Make the topic distribution into a dataframe
    td = pd.DataFrame(topic_dist)
    # Get the feature names (i.e., the words/terms)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Get the top words by topic
    topic_words = get_top_words(lda, tf_feature_names, 20)
    # Sort the topic columns by how often each topic is used
    topic_words = topic_words.reindex_axis(sorted(topic_words.columns, key=lambda x: td[x].sum(), reverse=True), axis=1)

    # Rearrange the distribution columns by how often each topic is used
    td = td.reindex_axis(sorted(td.columns, key=lambda x: td[x].sum(), reverse=True), axis=1)

    topic_words.to_csv(args.t, index=False)

    df = pd.DataFrame(doc_data)
    df = df.join(td)

    df.to_csv(args.o, index=False)


def get_abstracts(fn):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        doc_data = []
        for r in in_csv:
            try:
                curr_abstract = r['abstract']
                # If this isn't really an abstract, then don't add it
                if len(curr_abstract) > 5:
                    # Add the abstract to the corpus, and save the row's data
                    abstracts.append(r['abstract'])
                    doc_data.append(r)
            except KeyError:
                print(r)
        return abstracts, doc_data


def get_top_words(model, feature_names, n_top_words):
    '''Takes the model, the words used, and the number of words requested.
    Returns a dataframe of the top n_top_words for each topic'''
    r = pd.DataFrame()
    # For each topic
    for i, topic in enumerate(model.components_):
        # Get the top feature names, and put them in that column
        r[i] = [add_quotes(feature_names[j])
                for j in topic.argsort()[:-n_top_words - 1:-1]]
    return r


def add_quotes(s):
    '''Adds quotes around multi-term phrases'''
    if " " in s:
        s = '"{}"'.format(s)
    return s


if __name__ == '__main__':
    main()
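For reference, a typical invocation with the default paths would look something like the line below. This assumes the script is run from the repository root, that the processed_data/ directory already exists, and that the input TSV has an 'abstract' column; the flags simply restate the argparse defaults in the script.

    python code/topic_modeling/00_topics_extraction.py -i processed_data/abstracts.tsv -o processed_data/abstracts_LDA.csv -t processed_data/top_words.csv

The two outputs are then the per-abstract topic distributions (-o) and the top 20 words for each topic (-t).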