diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index a7d2027..794e82d 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -4,6 +4,7 @@ import pandas as pd
 import glob
 import copy
 from statistics import mean, median
+from strip_markdown import strip_markdown
 
 from getMetadata import metadata_for_file
 
@@ -51,6 +52,10 @@ def preprocess(corpus_list):
                           "lineno", "python", "php", "ruby", "api"]
     stopwords.extend(specific_stopwords)
     D = copy.copy(corpus_list)
+    #stripping markdown from documents
+    D = [strip_markdown(doc) for doc in D]
+    #strip html
+    D = [re.sub(r'<.*?>', '', doc, flags=re.DOTALL) for doc in D]
     #mvp right now, can certainly be expanded as iterations of text analysis are done
     D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]
     lemmatizer = WordNetLemmatizer()
@@ -76,7 +81,7 @@ def text_preparation(lemmatized_text):
 #TODO: identify best LDA model here
 def lda_model_identification(data_vectorized):
     lda = LatentDirichletAllocation()
-    search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size':[128, 256]}
+    search_params = {'n_components': [3, 5, 10, 15, 20, 25, 30]}
     model = GridSearchCV(lda, param_grid=search_params, verbose=10)
     model.fit(data_vectorized)
     best_lda_model = model.best_estimator_
@@ -88,7 +93,7 @@ def lda_model_identification(data_vectorized):
 def best_lda_model(data_vectorized, vocab):
     #Best Log Likelihood Score: -502085.9749390023
     #Model Perplexity: 1689.0943431883845
-    lda = LatentDirichletAllocation(n_components=5, learning_decay = 0.9, batch_size = 128, max_iter = 50)
+    lda = LatentDirichletAllocation(n_components=3, learning_decay = 0.9, batch_size = 128, max_iter = 50)
     id_topic = lda.fit_transform(data_vectorized)
     topic_words = {}
     for topic, comp in enumerate(lda.components_):
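
For reference, a minimal standalone sketch of the new cleanup pass added at the top of preprocess(), assuming the HTML step is meant to drop literal tags with a non-greedy <.*?> pattern; the sample document and variable names below are illustrative only, not part of the patch.

    # sketch: markdown/HTML cleanup as performed before tokenization
    import re
    from strip_markdown import strip_markdown

    docs = ["# Title\n\nSome <code>inline</code> text with a [link](https://example.com)."]
    # markdown syntax -> plain text
    docs = [strip_markdown(doc) for doc in docs]
    # drop any remaining HTML tags, including tags that span lines (re.DOTALL)
    docs = [re.sub(r'<.*?>', '', doc, flags=re.DOTALL) for doc in docs]
    print(docs)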