import csv
import os
import nltk
import pandas as pd
from statistics import mean, median
#from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def main():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    # reuse one lemmatizer instance instead of constructing a new one per word
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        instructions_metadata = {}
        print(filename)
        with open(instructions_dir + filename, "r") as file:
            word_list = file.read().split()
            word_count = len(word_list)
            lemmatized_words = []
            for word in word_list:
                lemma_word = lemmatizer.lemmatize(word)
                if lemma_word not in lemmatized_words:
                    lemmatized_words.append(lemma_word)
            # check whether keywords like "Checklist" or "Process" occur
            # check whether "HOWTO" occurs
            # count of distinct lemmatized words in this document
            unique_word_count = len(lemmatized_words)
            if "checklist" in lemmatized_words or "process" in lemmatized_words:
                print('contains keyword')
            print(word_count)
            all_word_counts.append(unique_word_count)
            # average word length for this document
            doc_word_len = []
            for word in word_list:
                doc_word_len.append(len(word))
            all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    # corpus-wide averages: mean word length and mean unique-word count
    print(sum(all_word_len) / len(all_word_len))
    print(sum(all_word_counts) / len(all_word_counts))

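# Hedged alternative sketch (not part of the original script): the membership
# test used to build lemmatized_words above is quadratic in document length; a
# set comprehension gives the same distinct-lemma count in one pass.
def count_unique_lemmas(word_list, lemmatizer=None):
    # default to a fresh WordNetLemmatizer if none is supplied
    lemmatizer = lemmatizer or nltk.WordNetLemmatizer()
    return len({lemmatizer.lemmatize(word) for word in word_list})
# e.g. count_unique_lemmas("Add tests before opening a merge request".split())
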
def consolidate_csv():
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("inst_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_doclist_roster.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(instructions_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_inst.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its contributing instructions
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(instructions_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_readme():
    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("readme_filepath")
    total_count = 0
    success_count = 0
    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_readme.md"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its README
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(readme_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_2():
    rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("rost_filepath")
    total_count = 0
    with open("kk_final_rosterslist.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            cleaning_files = "_contrib.json"
            pkg_name = filename[:-len(cleaning_files)]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name in item:
                    # copy the matching row and append the path to its contributor roster
                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

def consolidate_csv_3():
    rosters_dir = "/data/users/mgaughan/kkex_comment_data_121323/"
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    list_of_links = total_underprod_csv["project_name"].tolist()
    columns = list(total_underprod_csv.columns)
    columns.append("comments_filepath")
    total_count = 0
    with open("kk_final_commentlist.csv", 'w', newline='') as output_file:
        # csv.writer takes no column argument; the header row is written explicitly below
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(rosters_dir):
            total_count += 1
            row_value = []
            # filenames look like gh_comments_<project>.json; strip prefix and suffix
            cleaning_files = "gh_comments_"
            pkg_name = filename[len(cleaning_files):-len(".json")]
            print(pkg_name)
            for item in list_of_links:
                if pkg_name == item:
                    # copy the matching row and append the path to its comment data
                    row_value = total_underprod_csv.loc[total_underprod_csv["project_name"] == item].values.tolist()[0]
                    row_value.append(rosters_dir + filename)
                    print(row_value)
                    writer.writerow(row_value)

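# The consolidate_* functions above all follow the same join-and-write pattern.
# A hedged sketch of a single parameterized helper (the name, arguments, and
# match_fn callback are illustrative, not part of the original script):
def consolidate_generic(files_dir, out_csv, path_column, key_column, match_fn):
    # join every file in files_dir to a row of the underproduction table
    # and write the augmented rows to out_csv
    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
    keys = total_underprod_csv[key_column].tolist()
    columns = list(total_underprod_csv.columns) + [path_column]
    with open(out_csv, 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for filename in os.listdir(files_dir):
            for item in keys:
                if match_fn(filename, item):
                    row_value = total_underprod_csv.loc[total_underprod_csv[key_column] == item].values.tolist()[0]
                    row_value.append(files_dir + filename)
                    writer.writerow(row_value)
# e.g. consolidate_csv() could then be expressed as:
# consolidate_generic(
#     "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/",
#     "kk_final_doclist_roster.csv", "inst_filepath", "upstream_vcs_link",
#     lambda filename, link: filename[:-len("_inst.md")] in link)
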
def get_main_for_splice():
    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    all_word_len = []
    all_header_counts = []
    actual_index = 0
    # reuse one lemmatizer instance instead of constructing a new one per word
    lemmatizer = nltk.WordNetLemmatizer()
    for index, row in inst_doc_df.iterrows():
        actual_index += 1
        if actual_index < 700:
            for filename in os.listdir(instructions_dir):
                instructions_metadata = {}
                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
                    with open(instructions_dir + filename, "r") as file:
                        word_list = file.read().split()
                        word_count = len(word_list)
                        lemmatized_words = []
                        for word in word_list:
                            lemma_word = lemmatizer.lemmatize(word)
                            if lemma_word not in lemmatized_words:
                                lemmatized_words.append(lemma_word)
                        # check whether keywords like "Checklist" or "Process" occur
                        # check whether "HOWTO" occurs
                        # count of distinct lemmatized words in this document
                        unique_word_count = len(lemmatized_words)
                        print(word_count)
                        all_word_counts.append(unique_word_count)
                        doc_word_len = []
                        header_count = 0
                        for word in word_list:
                            if "#" in word:
                                header_count += 1
                            doc_word_len.append(len(word))
                        print(header_count)
                        all_header_counts.append(header_count)
                        all_word_len.append(sum(doc_word_len) / len(doc_word_len))
    #print(sum(all_word_len)/len(all_word_len))
    #print(sum(all_word_counts)/len(all_word_counts))
    print(mean(all_header_counts))
    print(median(all_header_counts))

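# Hedged alternative (not part of the original script): counting words that
# contain "#" also picks up tokens like "#123" or URL fragments; counting
# lines that start with "#" tracks Markdown headings more directly.
def count_markdown_headers(text):
    return sum(1 for line in text.splitlines() if line.lstrip().startswith("#"))
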
if __name__ == "__main__":
    consolidate_csv_3()