# 24_deb_pkg_gov/instructions_analysis.py
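"""
Word-count statistics and roster-building for the scraped package
documentation files.

main() and get_main_for_splice() report word counts, unique-lemma counts,
average word lengths, and counts of "#" header markers for the contribution
instruction documents; the consolidate_*() functions join the scraped files
(instruction docs, READMEs, contributor lists, and GitHub comment dumps)
back onto kk_final_expanded_data_final.csv by package name and write one
roster CSV per document type. Directory paths are specific to the original
author's environment.
"""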

import csv
import os
import nltk
import pandas as pd
from statistics import mean, median
#from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # WordNetLemmatizer below requires the WordNet corpus

def main():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    for filename in os.listdir(instructions_dir):
        instructions_metadata = {}
        print(filename)
        with open(instructions_dir + filename, "r") as file:
            word_list = file.read().split()
            word_count = len(word_list)
            # build the list of unique lemmas; lowercase so "Checklist"/"Process"
            # match the keyword test below
            lemmatizer = nltk.WordNetLemmatizer()
            lemmatized_words = []
            for word in word_list:
                lemma_word = lemmatizer.lemmatize(word.lower())
                if lemma_word not in lemmatized_words:
                    lemmatized_words.append(lemma_word)
            # pulling whether or not keywords like "Checklist" or "Process" occur?
            # pulling whether "HOWTO" occurs
            unique_word_count = len(lemmatized_words)
            if "checklist" in lemmatized_words or "process" in lemmatized_words:
                print('contains keyword')
            print(word_count)
            all_word_counts.append(unique_word_count)
            doc_word_len = []
            for word in word_list:
                doc_word_len.append(len(word))
            all_word_len.append(sum(doc_word_len)/len(doc_word_len))
    # average word length and average unique-lemma count across all documents
    print(sum(all_word_len)/len(all_word_len))
    print(sum(all_word_counts)/len(all_word_counts))

def consolidate_csv():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("inst_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_doclist_roster.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(instructions_dir):
            total_count += 1
            row_value = []
            # strip the "_inst.md" suffix to recover the package name
            cleaning_files = "_inst.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(instructions_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_readme():
    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("readme_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            total_count += 1
            row_value = []
            # strip the "_readme.md" suffix to recover the package name
            cleaning_files = "_readme.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(readme_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_2():
    rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("rost_filepath")
    total_count = 0
    with open("kk_final_rosterslist.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            # strip the "_contrib.json" suffix to recover the package name
            cleaning_files = "_contrib.json"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_3():
    rosters_dir = "/data/users/mgaughan/kkex_comment_data_121323/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["project_name"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("comments_filepath")
    total_count = 0
    with open("kk_final_commentlist.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            # filenames look like "gh_comments_<project>.json"
            cleaning_files = "gh_comments_"
            pkg_name = filename[len(cleaning_files):-len(".json")]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name == item:
                    row_value = total_underprod_csv.loc[total_underprod_csv["project_name"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def get_main_for_splice():
    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    all_header_counts = []
    actual_index = 0
    for index, row in inst_doc_df.iterrows():
        actual_index += 1
        # only look at the first 700 rows after sorting by underproduction_mean
        if actual_index < 700:
            for filename in os.listdir(instructions_dir):
                instructions_metadata = {}
                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
                    with open(instructions_dir + filename, "r") as file:
                        word_list = file.read().split()
                        word_count = len(word_list)
                        lemmatizer = nltk.WordNetLemmatizer()
                        lemmatized_words = []
                        for word in word_list:
                            lemma_word = lemmatizer.lemmatize(word)
                            if lemma_word not in lemmatized_words:
                                lemmatized_words.append(lemma_word)
                        # pulling whether or not keywords like "Checklist" or "Process" occur?
                        # pulling whether "HOWTO" occurs
                        unique_word_count = len(lemmatized_words)
                        print(word_count)
                        all_word_counts.append(unique_word_count)
                        doc_word_len = []
                        header_count = 0
                        # count "#" header markers and collect per-word lengths
                        for word in word_list:
                            if "#" in word:
                                header_count += 1
                            doc_word_len.append(len(word))
                        print(header_count)
                        all_header_counts.append(header_count)
                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
    #print(sum(all_word_len)/len(all_word_len))
    #print(sum(all_word_counts)/len(all_word_counts))
    print(mean(all_header_counts))
    print(median(all_header_counts))
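
# --- Illustrative sketch (not part of the original script) ------------------
# The four consolidate_* functions above differ only in the directory they
# scan, the filename prefix/suffix they strip, the CSV column they match on,
# and the output paths. A single parameterized helper along these lines could
# cover the same ground; the name consolidate_roster and its arguments are
# hypothetical, and it mimics the substring matching used by the first three
# consolidate functions.
def consolidate_roster(docs_dir, prefix, suffix, match_column, path_column, out_csv):
    source_df = pd.read_csv("kk_final_expanded_data_final.csv")
    match_values = source_df[match_column].tolist()
    columns = list(source_df.columns) + [path_column]
    with open(out_csv, 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(docs_dir):
            # strip the fixed prefix and suffix to recover the package name
            pkg_name = filename[len(prefix):len(filename) - len(suffix)]
            for item in match_values:
                if pkg_name in str(item):
                    row_value = source_df.loc[source_df[match_column] == item].values.tolist()[0]
                    row_value.append(docs_dir + filename)
                    writer.writerow(row_value)

# e.g., consolidate_csv() would correspond to:
# consolidate_roster("/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/",
#                    "", "_inst.md", "upstream_vcs_link", "inst_filepath",
#                    "kk_final_doclist_roster.csv")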

if __name__ == "__main__":
    consolidate_csv_3()