backing up readme

parent 8a348ed94e
commit b548f2eab0
@@ -9,24 +9,34 @@ key = os.environ.get('KKEXKEY')
 def main():
     wd = os.getcwd()
-    with open("expanded_data_final.csv", "r") as file:
+    with open("kk_final_expanded_data_final.csv", "r") as file:
         reader = csv.reader(file)
         for i, line in enumerate(reader):
             print("")
             repo_name = line[5].strip().split("/")[-1]
             print("on " + repo_name)
             repo_url = str(line[5].strip())
-            temp_dir = "/data/users/mgaughan/kkex_contrib_files_122023/tmp/" + repo_name
+            temp_dir = "/data/users/mgaughan/kkex_readme_021824/tmp" + repo_name
             try:
                 Repo.clone_from(repo_url, temp_dir)
             except:
                 print("issue cloning")
                 continue
+            '''
             if os.path.exists(temp_dir + "/.all-contributorsrc"):
                 shutil.copyfile(temp_dir + "/.all-contributorsrc", "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/" + repo_name + "_contrib.json")
             if os.path.exists(temp_dir + "/CONTRIBUTING.md"):
                 shutil.copyfile(temp_dir + "/CONTRIBUTING.md", "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/" + repo_name + "_inst.md")
+            '''
+            #cannot run yet, with api chokepoint
+            if os.path.exists(temp_dir + "/README.md"):
+                shutil.copyfile(temp_dir + "/README.md", "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md")
+            if os.path.exists(temp_dir + "/GOVERNANCE.md"):
+                shutil.copyfile(temp_dir + "/GOVERNANCE.md", "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_readme.md")
             shutil.rmtree(temp_dir, ignore_errors=True)
     print('done')


 if __name__ == "__main__":
     main()
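Two details worth flagging in this hunk: the new path concatenates "tmp" + repo_name with no separator, and the GOVERNANCE.md copy reuses the "_readme.md" suffix, so governance backups land with readme-style names. The sketch below is not part of the commit; it shows the same clone-and-copy step with those details tightened, assuming GitPython and the directory layout above (back_up_docs, tmp_root, and out_root are hypothetical names).

import os
import shutil
from git import Repo, GitCommandError

def back_up_docs(repo_url, repo_name, tmp_root, out_root):
    # os.path.join keeps the separator that "tmp" + repo_name drops
    temp_dir = os.path.join(tmp_root, repo_name)
    try:
        Repo.clone_from(repo_url, temp_dir)
    except GitCommandError as err:
        print("issue cloning: " + str(err))
        return
    # copy each doc if present; "_governance.md" avoids reusing the readme suffix
    for doc, subdir, suffix in [("README.md", "readme", "_readme.md"),
                                ("GOVERNANCE.md", "governance", "_governance.md")]:
        src = os.path.join(temp_dir, doc)
        if os.path.exists(src):
            shutil.copyfile(src, os.path.join(out_root, subdir, repo_name + suffix))
    shutil.rmtree(temp_dir, ignore_errors=True)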
@@ -2,6 +2,7 @@ import csv
 import os
+import nltk
 import pandas as pd
 from statistics import mean, median
 #from nltk.stem import WordNetLemmatizer

 nltk.download('wordnet')
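A side note, not something this diff changes: nltk.download('wordnet') fetches the corpus that WordNetLemmatizer reads at call time, and on recent NLTK releases the Open Multilingual Wordnet is needed as well, so a fuller setup (an assumption about the runtime environment) would be:

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # also required by WordNetLemmatizer on newer NLTK releases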
@@ -59,6 +60,30 @@ def consolidate_csv():
                     row_value.append(instructions_dir + filename)
                     print(row_value)
                     writer.writerow(row_value)


+def consolidate_readme():
+    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
+    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
+    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
+    columns = list(total_underprod_csv.columns)
+    columns.append("readme_filepath")
+    total_count = 0
+    success_count = 0
+    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
+        writer = csv.writer(output_file, columns)
+        writer.writerow(columns)
+        for filename in os.listdir(readme_dir):
+            total_count += 1
+            row_value = []
+            cleaning_files = "_readme.md"
+            pkg_name = filename[:-len(cleaning_files)]
+            print(pkg_name)
+            for item in list_of_links:
+                if pkg_name in item:
+                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
+                    row_value.append(readme_dir + filename)
+                    print(row_value)
+                    writer.writerow(row_value)

 def consolidate_csv_2():
     rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
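In consolidate_readme() above, the second positional argument to csv.writer is the dialect, not a header list, so passing columns there is effectively inert (the header only appears because of the explicit writer.writerow(columns)); success_count is also initialized but never incremented, and "if pkg_name in item" is a substring match that can pair a readme with the wrong link when one package name is a prefix of another. A sketch of the same roster join with an exact tail match, not the commit's code, with column and file names taken from the hunk above:

import csv
import os
import pandas as pd

def consolidate_readme(readme_dir, out_csv="kk_final_readme_roster.csv"):
    df = pd.read_csv("kk_final_expanded_data_final.csv")
    columns = list(df.columns) + ["readme_filepath"]
    # index links by their final path segment for an exact, not substring, match
    by_tail = {link.strip().split("/")[-1]: link for link in df["upstream_vcs_link"]}
    with open(out_csv, "w", newline="") as output_file:
        writer = csv.writer(output_file)  # default dialect; a column list is not a writer argument
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            pkg_name = filename[:-len("_readme.md")]
            link = by_tail.get(pkg_name)
            if link is None:
                continue
            row = df.loc[df["upstream_vcs_link"] == link].values.tolist()[0]
            writer.writerow(row + [readme_dir + filename])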
@@ -83,8 +108,46 @@ def consolidate_csv_2():
             print(row_value)
             writer.writerow(row_value)


+def get_main_for_splice():
+    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
+    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
+    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
+    all_word_counts = []
+    all_word_len = []
+    all_header_counts = []
+    actual_index = 0
+    for index, row in inst_doc_df.iterrows():
+        actual_index += 1
+        if actual_index < 700:
+            for filename in os.listdir(instructions_dir):
+                instructions_metadata = {}
+                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
+                    with open(instructions_dir + filename, "r") as file:
+                        word_list = file.read().split()
+                        word_count = len(word_list)
+                        lemmatized_words = []
+                        for word in word_list:
+                            lemma_word = nltk.WordNetLemmatizer().lemmatize(word)
+                            if lemma_word not in lemmatized_words:
+                                lemmatized_words.append(lemma_word)
+                        # pulling whether or not keywords like "Checklist" or "Process" occur?
+                        # pulling whether "HOWTO" occurs
+                        unique_word_count = len(word_list)
+                        print(word_count)
+                        all_word_counts.append(unique_word_count)
+                        doc_word_len = []
+                        header_count = 0
+                        for word in word_list:
+                            if "#" in word:
+                                header_count += 1
+                            doc_word_len.append(len(word))
+                        print(header_count)
+                        all_header_counts.append(header_count)
+                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
+    #print(sum(all_word_len)/len(all_word_len))
+    #print(sum(all_word_counts)/len(all_word_counts))
+    print(mean(all_header_counts))
+    print(median(all_header_counts))

 if __name__ == "__main__":
-    main()
+    consolidate_readme()
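In get_main_for_splice() above, a new WordNetLemmatizer is constructed for every word, unique_word_count reuses len(word_list) (so the collected "unique" counts always equal the raw word counts), and the "not in list" membership test is quadratic. A per-document version of the same stats, offered as an assumption-labeled sketch rather than the commit's code, keeping the hunk's "#"-in-word heuristic for Markdown headers:

from statistics import mean, median
import nltk

def doc_stats(text):
    lemmatizer = nltk.WordNetLemmatizer()  # build once, not once per word
    words = text.split()
    lemmas = {lemmatizer.lemmatize(w) for w in words}  # a set deduplicates in O(1) per word
    header_count = sum(1 for w in words if "#" in w)
    avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0
    # len(lemmas) is the unique-lemma count the hunk appears to intend
    return len(words), len(lemmas), header_count, avg_word_len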
kk_final_readme_roster.csv  2695 lines  (new file)
File diff suppressed because it is too large