import csv
import os
import nltk
import pandas as pd
from statistics import mean, median
import json
#from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')


def main():
    # Word-count, unique-word, and keyword statistics for the contributing-instruction documents
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        instructions_metadata = {}
        print(filename)
        with open(instructions_dir + filename, "r") as file:
            word_list = file.read().split()
            word_count = len(word_list)
            lemmatized_words = []
            for word in word_list:
                lemma_word = lemmatizer.lemmatize(word)
                if lemma_word not in lemmatized_words:
                    lemmatized_words.append(lemma_word)
            # pulling whether or not keywords like "Checklist" or "Process" occur?
            # pulling whether "HOWTO" occurs
            unique_word_count = len(lemmatized_words)
            if "checklist" in lemmatized_words or "process" in lemmatized_words:
                print('contains keyword')
            print(word_count)
            all_word_counts.append(unique_word_count)
            doc_word_len = []
            for word in word_list:
                doc_word_len.append(len(word))
            all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    # averages across all documents
    print(sum(all_word_len) / len(all_word_len))
    print(sum(all_word_counts) / len(all_word_counts))


def consolidate_csv():
    # Match each contributing-instruction file to its row in the underproduction data
    # and write the joined rows (plus the instruction file path) to a roster CSV
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("inst_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_doclist_roster.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(instructions_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_inst.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(instructions_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)


def consolidate_readme():
    # Same join as consolidate_csv(), but for README files
    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("readme_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_readme.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(readme_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)


def consolidate_csv_2():
    # Same join, but for the contributor-list JSON files
    rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("rost_filepath")
    total_count = 0
    with open("kk_final_rosterslist.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_contrib.json"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)


def consolidate_csv_3():
    # Same join, but for the GitHub comment JSON files, matched on project_name
    rosters_dir = "/data/users/mgaughan/kkex_comment_data_121323/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["project_name"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("comments_filepath")
    total_count = 0
    with open("kk_final_commentlist.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            cleaning_files = "gh_comments_"
            pkg_name = filename[len(cleaning_files):-len(".json")]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name == item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["project_name"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)


def count_comments():
    # Count issue comment edges across the scraped GitHub comment JSON files
    comment = 0
    projects = 0
    rosters_dir = "/data/users/mgaughan/kkex_comment_data_121323/"
    for filename in os.listdir(rosters_dir):
        try:
            with open(rosters_dir + filename, "r") as json_file:
                file = json.load(json_file)
        except (OSError, ValueError):
            # skip files that cannot be read or parsed as JSON
            continue
        try:
            comment += len(file['data']['repository']['issues']['edges'])
            print(len(file['data']['repository']['issues']['edges']))
            projects += 1
        except (KeyError, TypeError):
            print("oops")
    print(comment)
    print(projects)


def get_main_for_splice():
    # Recompute word-count and header-count statistics for the first ~700 rows
    # of the instruction-document roster, sorted by underproduction_mean
    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    all_header_counts = []
    lemmatizer = nltk.WordNetLemmatizer()
    actual_index = 0
    for index, row in inst_doc_df.iterrows():
        actual_index += 1
        if actual_index < 700:
            for filename in os.listdir(instructions_dir):
                instructions_metadata = {}
                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
                    with open(instructions_dir + filename, "r") as file:
                        word_list = file.read().split()
                        word_count = len(word_list)
                        lemmatized_words = []
                        for word in word_list:
                            lemma_word = lemmatizer.lemmatize(word)
                            if lemma_word not in lemmatized_words:
                                lemmatized_words.append(lemma_word)
                        # pulling whether or not keywords like "Checklist" or "Process" occur?
                        # pulling whether "HOWTO" occurs
                        unique_word_count = len(lemmatized_words)
                        print(word_count)
                        all_word_counts.append(unique_word_count)
                        doc_word_len = []
                        header_count = 0
                        for word in word_list:
                            if "#" in word:
                                header_count += 1
                            doc_word_len.append(len(word))
                        print(header_count)
                        all_header_counts.append(header_count)
                        all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    #print(sum(all_word_len)/len(all_word_len))
    #print(sum(all_word_counts)/len(all_word_counts))
    print(mean(all_header_counts))
    print(median(all_header_counts))


if __name__ == "__main__":
    count_comments()