diff --git a/text_analysis/partitioning_files.py b/text_analysis/partitioning_files.py
new file mode 100644
index 0000000..7303d6f
--- /dev/null
+++ b/text_analysis/partitioning_files.py
@@ -0,0 +1,73 @@
+import csv
+import io
+import shutil
+import os
+from tqdm import tqdm
+from pathlib import Path
+
+
+# Source directory holding the README snapshot files and the destination
+# root that the numbered group subdirectories live under.
+working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/1/"
+dest_dir = "/data/users/mgaughan/kkex/time_specific_files/readme3/test/"
+
+# (vcs_link, filename) pairs whose snapshot filename cannot be derived
+# from the repository name and must therefore be matched explicitly.
+HARD_CODED_FILES = {
+    ("https://github.com/df7cb/pg_filedump.git", "pg_filedump.git_README.pg_filedump"),
+    ("https://github.com/sleuthkit/sleuthkit", "sleuthkit_README_win32.txt"),
+    ("https://github.com/metlov/cycle.git", "cycle.git_README_ru.html"),
+    ("https://github.com/winchen/engauge_debian", "engauge_debian_README_for_osx"),
+    ("https://github.com/babelouest/yder", "yder_README_8md.html"),
+    ("https://github.com/SebWouters/CheMPS2", "CheMPS2_README_8md_source.html"),
+    ("https://github.com/TACC/Lmod", "Lmod_README_lua_modulefiles.txt"),
+    ("https://github.com/hunspell/hyphen.git", "hyphen.git_README_hyph_en_US.txt"),
+    ("https://github.com/greenbone/openvas", "openvas_UPGRADE_README"),
+    ("https://github.com/MightyCreak/diffuse.git", "diffuse.git_README_ru"),
+}
+
+def hard_codes(vcs_link, filename):
+    # Return (True, filename) for a known hard-coded pairing, else (False, "").
+    if (vcs_link, filename) in HARD_CODED_FILES:
+        return True, filename
+    return False, ""
+
+def move_file(vcs_link, group):
+    # Locate the snapshot file in working_dir that belongs to vcs_link and
+    # move it into dest_dir/<group>/.
+    if "github" in vcs_link or "gitlab" in vcs_link:
+        # making an evaluation that sub branches aren't being used and that
+        # people would fork if needed; this only looks at main
+        vcs_link = "/".join(vcs_link.split("/")[0:5])
+        full_temp_path = vcs_link.split('/')[4] + ".git"
+    else:
+        full_temp_path = vcs_link.split('/')[-1] + ".git"
+    rel_filename = ""
+    for filename in os.listdir(working_dir):
+        # Stop at the first match so a later non-matching entry cannot
+        # clobber rel_filename (the original reassigned it on every pass,
+        # so only a match on the final listdir entry reliably survived).
+        spec, hard_coded_name = hard_codes(vcs_link, filename)
+        if spec:
+            rel_filename = hard_coded_name
+            break
+        project = "_".join(filename.split("_")[:-1])
+        if full_temp_path[:-4] == project:
+            rel_filename = filename
+            break
+    if rel_filename != "":
+        target_path = working_dir + rel_filename
+        destination_path = dest_dir + str(group) + "/"
+        shutil.move(target_path, destination_path)
+
+def for_all_files(csv_path):
+    # Read the CSV of VCS links and alternate its data rows between
+    # destination groups 1 and 2; the header row (index 0) is skipped.
+    with open(csv_path, 'r') as file:
+        lines = [line for line in file]
+    for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
+        if index == 0:
+            continue
+        move_file(row[0], index % 2 + 1)
+
+if __name__ == "__main__":
+    for_all_files("final_data/deb_readme_did.csv")
diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 794e82d..91677c0 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -81,7 +81,7 @@ def text_preparation(lemmatized_text):
 #TODO: identify best LDA model here
 def lda_model_identification(data_vectorized):
     lda = LatentDirichletAllocation()
-    search_params = {'n_components': [3, 5, 10, 15, 20, 25, 30]}
+    search_params = {'n_components': [8], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }
     model = GridSearchCV(lda, param_grid=search_params, verbose=10)
     model.fit(data_vectorized)
     best_lda_model = model.best_estimator_
@@ -93,7 +93,7 @@ def lda_model_identification(data_vectorized):
 def best_lda_model(data_vectorized, vocab):
     #Best Log Likelihood Score: -502085.9749390023
     #Model Perplexity: 1689.0943431883845
-    lda = LatentDirichletAllocation(n_components=3, learning_decay = 0.9, batch_size = 128, max_iter = 50)
+    lda = LatentDirichletAllocation(n_components=8, learning_decay = 0.9, batch_size = 128, max_iter = 50)
     id_topic = lda.fit_transform(data_vectorized)
     topic_words = {}
     for topic, comp in enumerate(lda.components_):
@@ -104,7 +104,6 @@ def best_lda_model(data_vectorized, vocab):
         print(' %s' % ', '.join(words))
     #lda.print_topics(num_words=10)
 
-#TODO: evaluate model and identified topics
 
 
 if __name__ == "__main__":