import csv
import os

import nltk
import pandas as pd

# WordNetLemmatizer is re-exported at the top level of nltk, so the
# explicit import is optional:
# from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


def main():
    """Count words and unique lemmas in each contributing-instructions file."""
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    # Instantiate the lemmatizer once rather than once per word.
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        print(filename)
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()
            word_count = len(word_list)
            # Collect the distinct lemmas in the file; lowercase first so the
            # keyword checks below are case-insensitive.
            lemmatized_words = set()
            for word in word_list:
                lemmatized_words.add(lemmatizer.lemmatize(word.lower()))
            # pulling whether or not keywords like "Checklist" or "Process" occur?
            # pulling whether "HOWTO" occurs?
            unique_word_count = len(lemmatized_words)
            if "checklist" in lemmatized_words or "process" in lemmatized_words:
                print('contains keyword')
            print(word_count)
            all_word_counts.append(unique_word_count)
    # Average number of unique lemmas per instruction file.
    print(sum(all_word_counts) / len(all_word_counts))


def consolidate_csv():
    """Join each *_inst.md file to its row in the underproduction data and
    write the row, plus the instruction file path, to a new CSV."""
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("inst_filepath")
    total_count = 0
    with open("kk_final_doclist_roster.csv", 'w', newline='') as output_file:
        # csv.writer takes only the file object here; passing the column list
        # as a second positional argument would be misread as a dialect.
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(instructions_dir):
            total_count += 1
            # Strip the "_inst.md" suffix to recover the package name.
            suffix = "_inst.md"
            pkg_name = filename[:-len(suffix)]
            print(pkg_name)
            row_value = []
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[
                        total_underprod_csv["upstream_vcs_link"] == item
                    ].values.tolist()[0]
                    row_value.append(os.path.join(instructions_dir, filename))
            # Skip packages with no matching upstream link rather than
            # writing an empty row.
            if row_value:
                print(row_value)
                writer.writerow(row_value)


def consolidate_csv_2():
    """Same join as consolidate_csv, but for the *_contrib.json roster files."""
    rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("rost_filepath")
    total_count = 0
    with open("kk_final_rosterslist.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            # Strip the "_contrib.json" suffix to recover the package name.
            suffix = "_contrib.json"
            pkg_name = filename[:-len(suffix)]
            print(pkg_name)
            row_value = []
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[
                        total_underprod_csv["upstream_vcs_link"] == item
                    ].values.tolist()[0]
                    row_value.append(os.path.join(rosters_dir, filename))
            if row_value:
                print(row_value)
                writer.writerow(row_value)


if __name__ == "__main__":
    consolidate_csv_2()