import csv
import os
import nltk
import pandas as pd
from statistics import mean, median
#from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def main():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    # reuse one lemmatizer instance instead of constructing a new one per word
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        instructions_metadata = {}
        print(filename)
        with open(instructions_dir + filename, "r") as file:
            word_list = file.read().split()
            word_count = len(word_list)
            lemmatized_words = []
            for word in word_list:
                lemma_word = lemmatizer.lemmatize(word)
                if lemma_word not in lemmatized_words:
                    lemmatized_words.append(lemma_word)
            # check whether keywords like "Checklist" or "Process" occur
            # check whether "HOWTO" occurs
            # count of distinct lemmatized words in this document
            unique_word_count = len(lemmatized_words)
            if "checklist" in lemmatized_words or "process" in lemmatized_words:
                print('contains keyword')
            print(word_count)
            all_word_counts.append(unique_word_count)
            # average word length for this document
            doc_word_len = []
            for word in word_list:
                doc_word_len.append(len(word))
            all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    # corpus-wide averages: mean word length and mean unique-word count
    print(sum(all_word_len) / len(all_word_len))
    print(sum(all_word_counts) / len(all_word_counts))

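# Hedged alternative sketch (not part of the original script): the membership
# test used to build lemmatized_words above is quadratic in document length; a
# set comprehension gives the same distinct-lemma count in one pass.
def count_unique_lemmas(word_list, lemmatizer=None):
    # default to a fresh WordNetLemmatizer if none is supplied
    lemmatizer = lemmatizer or nltk.WordNetLemmatizer()
    return len({lemmatizer.lemmatize(word) for word in word_list})
# e.g. count_unique_lemmas("Add tests before opening a merge request".split())
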
def consolidate_csv():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("inst_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_doclist_roster.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(instructions_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_inst.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its contributing instructions
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(instructions_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_readme():
    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("readme_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_readme.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its README
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(readme_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_2():
    rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("rost_filepath")
    total_count = 0
    with open("kk_final_rosterslist.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_contrib.json"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its contributor roster
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_3():
    rosters_dir = "/data/users/mgaughan/kkex_comment_data_121323/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["project_name"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("comments_filepath")
    total_count = 0
    with open("kk_final_commentlist.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            # filenames look like gh_comments_<project>.json; strip prefix and suffix
            cleaning_files = "gh_comments_"
            pkg_name = filename[len(cleaning_files):-len(".json")]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name == item:
                    # copy the matching row and append the path to its comment data
                    row_value = total_underprod_csv.loc[total_underprod_csv["project_name"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

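# The consolidate_* functions above all follow the same join-and-write pattern.
# A hedged sketch of a single parameterized helper (the name, arguments, and
# match_fn callback are illustrative, not part of the original script):
def consolidate_generic(files_dir, out_csv, path_column, key_column, match_fn):
    # join every file in files_dir to a row of the underproduction table
    # and write the augmented rows to out_csv
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    keys = total_underprod_csv[key_column].tolist()
    columns = list(total_underprod_csv.columns) + [path_column]
    with open(out_csv, 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(files_dir):
            for item in keys:
                if match_fn(filename, item):
                    row_value = total_underprod_csv.loc[total_underprod_csv[key_column] == item].values.tolist()[0]
                    row_value.append(files_dir + filename)
                    writer.writerow(row_value)
# e.g. consolidate_csv() could then be expressed as:
# consolidate_generic(
#     "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/",
#     "kk_final_doclist_roster.csv", "inst_filepath", "upstream_vcs_link",
#     lambda filename, link: filename[:-len("_inst.md")] in link)
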
def get_main_for_splice():
    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    all_header_counts = []
    actual_index = 0
    # reuse one lemmatizer instance instead of constructing a new one per word
    lemmatizer = nltk.WordNetLemmatizer()
    for index, row in inst_doc_df.iterrows():
        actual_index += 1
        if actual_index < 700:
            for filename in os.listdir(instructions_dir):
                instructions_metadata = {}
                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
                    with open(instructions_dir + filename, "r") as file:
                        word_list = file.read().split()
                        word_count = len(word_list)
                        lemmatized_words = []
                        for word in word_list:
                            lemma_word = lemmatizer.lemmatize(word)
                            if lemma_word not in lemmatized_words:
                                lemmatized_words.append(lemma_word)
                        # check whether keywords like "Checklist" or "Process" occur
                        # check whether "HOWTO" occurs
                        # count of distinct lemmatized words in this document
                        unique_word_count = len(lemmatized_words)
                        print(word_count)
                        all_word_counts.append(unique_word_count)
                        doc_word_len = []
                        header_count = 0
                        for word in word_list:
                            if "#" in word:
                                header_count += 1
                            doc_word_len.append(len(word))
                        print(header_count)
                        all_header_counts.append(header_count)
                        all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    #print(sum(all_word_len)/len(all_word_len))
    #print(sum(all_word_counts)/len(all_word_counts))
    print(mean(all_header_counts))
    print(median(all_header_counts))

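# Hedged alternative (not part of the original script): counting words that
# contain "#" also picks up tokens like "#123" or URL fragments; counting
# lines that start with "#" tracks Markdown headings more directly.
def count_markdown_headers(text):
    return sum(1 for line in text.splitlines() if line.lstrip().startswith("#"))
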
if __name__ == "__main__":
    consolidate_csv_3()