first draft of the topic model

parent 793ddcc632
commit 9d9bb8da1e
@@ -26,6 +26,7 @@ from nltk.corpus import stopwords
 from nltk.stem.wordnet import WordNetLemmatizer
 
 stopwords = stopwords.words('english')
+#https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
 
 #loading data in, getting misc descriptors
 def get_data_from_dir(directory):
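The linked IR-book chapter motivates the stop word list above: extremely common words carry little topical signal. For reference, a minimal sketch of what filtering against NLTK's English list does (the sample tokens are invented; assumes the nltk stopword corpus has been downloaded, e.g. via nltk.download('stopwords')):

    from nltk.corpus import stopwords

    sw = stopwords.words('english')   # common function words: "the", "is", "at", ...
    tokens = ["the", "model", "is", "learning", "topics"]
    print([t for t in tokens if t not in sw])   # ['model', 'learning', 'topics']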
@@ -45,9 +46,13 @@ def get_data_from_dir(directory):
 
 #preprocessing text data
 def preprocess(corpus_list):
+    #extending stopwords
+    specific_stopwords = ["http", "com", "www", "org", "file", "code", "time", "software", "use", "user", "set", "line", "run", "source", "github",
+                          "lineno", "python", "php", "ruby", "api"]
+    stopwords.extend(specific_stopwords)
     D = copy.copy(corpus_list)
     #mvp right now, can certainly be expanded as iterations of text analysis are done
-    D = [[token for token in simple_preprocess(doc) if token not in stopwords]for doc in D]
+    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]
     lemmatizer = WordNetLemmatizer()
     D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]
     return D_lemma
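The preprocessing change tightens tokenization: gensim's simple_preprocess already lowercases and drops tokens shorter than 2 characters, and the new len(token) > 2 guard removes 2-character leftovers as well. A hedged, self-contained sketch of the pipeline this hunk builds (the sample sentence is invented; assumes nltk's wordnet data is available):

    from gensim.utils import simple_preprocess
    from nltk.stem.wordnet import WordNetLemmatizer

    doc = "Running the tests in CI: users filed issues against two codebases"
    tokens = [t for t in simple_preprocess(doc) if len(t) > 2]   # drops "in", "ci"
    lemmatizer = WordNetLemmatizer()
    print([lemmatizer.lemmatize(t) for t in tokens])
    # plurals WordNet knows ("users", "issues") reduce to singular; note the
    # lemmatizer defaults to noun part-of-speech, so "running" passes through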
@@ -63,7 +68,7 @@ def text_preparation(lemmatized_text):
         D_bigrams[i].append(token)
     #id2word
     id2word = corpora.Dictionary(D_bigrams)
-    id2word.filter_extremes(no_below=2, no_above=0.5)
+    id2word.filter_extremes(no_below=5, no_above=0.5)
     #bow representation
     bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
     return bag_of_words, id2word
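Raising no_below from 2 to 5 means a token must now appear in at least five documents to stay in the dictionary, while no_above=0.5 continues to drop tokens occurring in more than half of them. A toy sketch of the combined effect (invented documents; assumes gensim):

    from gensim import corpora

    docs = ([["alpha", "beta", "gamma"]] * 2
            + [["alpha", "beta"]] * 3
            + [["alpha"]] * 5)
    id2word = corpora.Dictionary(docs)
    # "alpha" is in 10/10 docs (> 50%) and "gamma" in only 2 (< 5): both removed
    id2word.filter_extremes(no_below=5, no_above=0.5)
    print(id2word.token2id)   # only "beta" (5/10 docs) survives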
@@ -80,7 +85,7 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
-def best_lda_model(data_vectorized):
+def best_lda_model(data_vectorized, vocab):
     #Best Log Likelihood Score: -502085.9749390023
     #Model Perplexity: 1689.0943431883845
     lda = LatentDirichletAllocation(n_components=5, learning_decay = 0.9, batch_size = 128, max_iter = 50)
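The new vocab parameter is what makes human-readable topics possible: sklearn's LatentDirichletAllocation only exposes term weights by column index in components_, so mapping a topic back to words needs the vectorizer's vocabulary. A hedged sketch of that mapping (the helper name is ours, not the commit's):

    import numpy as np

    def top_words_per_topic(lda, vocab, n_words=10):
        # components_ has shape (n_topics, n_terms); a larger weight means the
        # term is more representative of that topic
        for k, weights in enumerate(lda.components_):
            top = np.argsort(weights)[::-1][:n_words]
            print('Topic: %d' % k)
            print(' %s' % ', '.join(vocab[i] for i in top))

Called as top_words_per_topic(lda, vectorizer.get_feature_names_out()) after fitting, this would reproduce the Topic/word-list printout in the loop below.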
@@ -92,6 +97,7 @@ def best_lda_model(data_vectorized):
     for topic, words in topic_words.items():
         print('Topic: %d' % topic)
         print(' %s' % ', '.join(words))
+    #lda.print_topics(num_words=10)
 
 #TODO: evaluate model and identified topics
 
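One caution on the newly added commented-out line: print_topics(num_words=10) is the gensim LdaModel API, and sklearn's LatentDirichletAllocation has no such method, so the call would fail if uncommented here. For the evaluation TODO, one common starting point is the document-topic distribution; a sketch under the assumption that lda is the fitted estimator above:

    doc_topic = lda.transform(data_vectorized)   # shape (n_docs, n_topics); rows sum to 1
    dominant_topic = doc_topic.argmax(axis=1)    # most probable topic per document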
@@ -114,6 +120,9 @@ if __name__ == "__main__":
     )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
     #lda_model_identification(data_vectorized)
-    best_lda_model(data_vectorized)
+    #freqs = zip(vectorizer.get_feature_names_out(), data_vectorized.sum(axis=0).tolist()[0])
+    # sort from largest to smallest
+    #print(sorted(freqs, key=lambda x: -x[1])[:25])
+    best_lda_model(data_vectorized, vectorizer.get_feature_names_out())
 
 
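The commented-out frequency check is handy when tuning the stop word list: summing the document-term matrix column-wise gives corpus-wide counts per term. Uncommented and lightly annotated, it reads (assumes the vectorizer and data_vectorized defined in this __main__ block):

    # data_vectorized.sum(axis=0) is a 1 x V matrix of column sums;
    # .tolist()[0] flattens it to a plain list of per-term counts
    freqs = zip(vectorizer.get_feature_names_out(),
                data_vectorized.sum(axis=0).tolist()[0])
    print(sorted(freqs, key=lambda x: -x[1])[:25])   # 25 most frequent terms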