preparing for grouped topic analysis
This commit is contained in:
parent
4e2075fdf0
commit
618f225c38
79  text_analysis/partitioning_files.py  Normal file
@@ -0,0 +1,79 @@
import csv
import shutil
import os
from tqdm import tqdm


working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/1/"
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/readme3/test/"

def move_file(vcs_link, group):
    if "github" in vcs_link or "gitlab" in vcs_link:
        # judgment call: sub-branches aren't being used and people would fork
        # if they needed to, so this only looks at the main branch
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = vcs_link.split("/")[4] + ".git"
    else:
        full_temp_path = vcs_link.split("/")[-1] + ".git"
    # find the README snapshot that belongs to this project
    rel_filename = ""
    for filename in os.listdir(working_dir):
        # hard-coded exceptions first; only keep the result on an actual match
        # so a later miss doesn't clobber an earlier hit
        spec, hc_filename = hard_codes(vcs_link, filename)
        if spec:
            rel_filename = hc_filename
        project = "_".join(filename.split("_")[:-1])
        if full_temp_path[:-4] == project:
            rel_filename = filename
    if rel_filename != "":
        target_path = working_dir + rel_filename
        destination_path = dest_dir + str(group) + "/"
        shutil.move(target_path, destination_path)

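# note: shutil.move() above assumes the numbered group folder under dest_dir
# already exists; a minimal setup sketch, assuming only groups 1 and 2 as used
# in for_all_files below:
#     for group in (1, 2):
#         os.makedirs(dest_dir + str(group) + "/", exist_ok=True)
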
# README snapshots whose filenames don't follow the project-name convention,
# matched explicitly by (vcs_link, filename) pair
HARD_CODED_FILES = {
    "https://github.com/df7cb/pg_filedump.git": "pg_filedump.git_README.pg_filedump",
    "https://github.com/sleuthkit/sleuthkit": "sleuthkit_README_win32.txt",
    "https://github.com/metlov/cycle.git": "cycle.git_README_ru.html",
    "https://github.com/winchen/engauge_debian": "engauge_debian_README_for_osx",
    "https://github.com/babelouest/yder": "yder_README_8md.html",
    "https://github.com/SebWouters/CheMPS2": "CheMPS2_README_8md_source.html",
    "https://github.com/TACC/Lmod": "Lmod_README_lua_modulefiles.txt",
    "https://github.com/hunspell/hyphen.git": "hyphen.git_README_hyph_en_US.txt",
    "https://github.com/greenbone/openvas": "openvas_UPGRADE_README",
    "https://github.com/MightyCreak/diffuse.git": "diffuse.git_README_ru",
}


def hard_codes(vcs_link, filename):
    # returns (matched, filename) so callers can tell a hit from a miss
    if HARD_CODED_FILES.get(vcs_link) == filename:
        return True, filename
    return False, ""

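# example of the exception path (values taken from the mapping above):
#     hard_codes("https://github.com/TACC/Lmod", "Lmod_README_lua_modulefiles.txt")
#     # -> (True, "Lmod_README_lua_modulefiles.txt")
#     hard_codes("https://github.com/TACC/Lmod", "something_else")
#     # -> (False, "")
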
def for_all_files(csv_path):
    with open(csv_path, "r") as file:
        lines = file.readlines()
    for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
        if index == 0:
            # skip the header row
            continue
        # alternate rows between the two groups
        move_file(row[0], index % 2 + 1)


if __name__ == "__main__":
    for_all_files("final_data/deb_readme_did.csv")
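For reference, a minimal sketch of the input CSV that for_all_files assumes: the first row is a header and only the first column, the VCS link, is read. The header name here is illustrative, not taken from the data:

vcs_link
https://github.com/df7cb/pg_filedump.git
https://github.com/sleuthkit/sleuthkit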
@@ -81,7 +81,7 @@ def text_preparation(lemmatized_text):
 #TODO: identify best LDA model here
 def lda_model_identification(data_vectorized):
     lda = LatentDirichletAllocation()
-    search_params = {'n_components': [3, 5, 10, 15, 20, 25, 30]}
+    search_params = {'n_components': [8], 'learning_decay': [.5, .7, .9], 'batch_size': [128, 256]}
     model = GridSearchCV(lda, param_grid=search_params, verbose=10)
     model.fit(data_vectorized)
     best_lda_model = model.best_estimator_
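The likelihood and perplexity figures hard-coded as comments in the next hunk are the kind of numbers this grid search produces; a minimal sketch of reading them off after model.fit, assuming data_vectorized is the vectorized corpus passed in above:

print("Best params:", model.best_params_)
print("Best log likelihood score:", model.best_score_)
print("Model perplexity:", best_lda_model.perplexity(data_vectorized))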
@@ -93,7 +93,7 @@ def lda_model_identification(data_vectorized):
 def best_lda_model(data_vectorized, vocab):
     #Best Log Likelihood Score: -502085.9749390023
     #Model Perplexity: 1689.0943431883845
-    lda = LatentDirichletAllocation(n_components=3, learning_decay=0.9, batch_size=128, max_iter=50)
+    lda = LatentDirichletAllocation(n_components=8, learning_decay=0.9, batch_size=128, max_iter=50)
     id_topic = lda.fit_transform(data_vectorized)
     topic_words = {}
     for topic, comp in enumerate(lda.components_):
@@ -104,7 +104,6 @@ def best_lda_model(data_vectorized, vocab):
         print(' %s' % ', '.join(words))
     #lda.print_topics(num_words=10)
 
 #TODO: evaluate model and identified topics
-
 
 if __name__ == "__main__":