backing up readme

Matthew Gaughan 2024-02-23 10:25:13 -06:00
parent 8a348ed94e
commit b548f2eab0
3 changed files with 2773 additions and 5 deletions

View File

@@ -9,24 +9,34 @@ key = os.environ.get('KKEXKEY')
 def main():
     wd = os.getcwd()
-    with open("expanded_data_final.csv", "r") as file:
+    with open("kk_final_expanded_data_final.csv", "r") as file:
         reader = csv.reader(file)
         for i, line in enumerate(reader):
             print("")
             repo_name = line[5].strip().split("/")[-1]
             print("on " + repo_name)
             repo_url = str(line[5].strip())
-            temp_dir = "/data/users/mgaughan/kkex_contrib_files_122023/tmp/" + repo_name
+            temp_dir = "/data/users/mgaughan/kkex_readme_021824/tmp/" + repo_name
             try:
                 Repo.clone_from(repo_url, temp_dir)
             except Exception:
                 print("issue cloning")
                 continue
+            '''
             if os.path.exists(temp_dir + "/.all-contributorsrc"):
                 shutil.copyfile(temp_dir + "/.all-contributorsrc", "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/" + repo_name + "_contrib.json")
             if os.path.exists(temp_dir + "/CONTRIBUTING.md"):
                 shutil.copyfile(temp_dir + "/CONTRIBUTING.md", "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/" + repo_name + "_inst.md")
+            '''
+            # cannot run yet, with API chokepoint
+            if os.path.exists(temp_dir + "/README.md"):
+                shutil.copyfile(temp_dir + "/README.md", "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md")
+            if os.path.exists(temp_dir + "/GOVERNANCE.md"):
+                shutil.copyfile(temp_dir + "/GOVERNANCE.md", "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_governance.md")
             shutil.rmtree(temp_dir, ignore_errors=True)
+    print('done')
 if __name__ == "__main__":
     main()

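For context, the backup loop above boils down to one clone-then-copy step per repository. A minimal, self-contained sketch of that step, assuming GitPython is installed and the destination directory already exists; the helper name back_up_readme is illustrative and not part of the commit:

import os
import shutil
import tempfile
from git import Repo  # GitPython, as in the script above

def back_up_readme(repo_url, out_dir):
    # clone into a temporary directory that cleans itself up, then copy README.md out
    repo_name = repo_url.strip().split("/")[-1]
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            Repo.clone_from(repo_url, temp_dir)
        except Exception:
            print("issue cloning " + repo_url)
            return False
        readme_path = os.path.join(temp_dir, "README.md")
        if not os.path.exists(readme_path):
            return False
        shutil.copyfile(readme_path, os.path.join(out_dir, repo_name + "_readme.md"))
        return True

Using tempfile.TemporaryDirectory avoids the manual shutil.rmtree cleanup the script performs after each clone.
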
View File

@@ -2,6 +2,7 @@ import csv
 import os
 import nltk
 import pandas as pd
+from statistics import mean, median
 #from nltk.stem import WordNetLemmatizer
 nltk.download('wordnet')
@@ -59,6 +60,30 @@ def consolidate_csv():
                     row_value.append(instructions_dir + filename)
                     print(row_value)
                     writer.writerow(row_value)
+
+def consolidate_readme():
+    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
+    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
+    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
+    columns = list(total_underprod_csv.columns)
+    columns.append("readme_filepath")
+    total_count = 0
+    success_count = 0
+    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
+        writer = csv.writer(output_file)
+        writer.writerow(columns)
+        for filename in os.listdir(readme_dir):
+            total_count += 1
+            row_value = []
+            cleaning_files = "_readme.md"
+            pkg_name = filename[:-len(cleaning_files)]
+            print(pkg_name)
+            for item in list_of_links:
+                if pkg_name in item:
+                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
+                    row_value.append(readme_dir + filename)
+                    print(row_value)
+                    writer.writerow(row_value)
 def consolidate_csv_2():
     rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
@@ -83,8 +108,46 @@ def consolidate_csv_2():
                     print(row_value)
                     writer.writerow(row_value)
+
+def get_main_for_splice():
+    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
+    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
+    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
+    all_word_counts = []
+    all_word_len = []
+    all_header_counts = []
+    actual_index = 0
+    for index, row in inst_doc_df.iterrows():
+        actual_index += 1
+        if actual_index < 700:
+            for filename in os.listdir(instructions_dir):
+                instructions_metadata = {}
+                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
+                    with open(instructions_dir + filename, "r") as file:
+                        word_list = file.read().split()
+                        word_count = len(word_list)
+                        lemmatized_words = []
+                        for word in word_list:
+                            lemma_word = nltk.WordNetLemmatizer().lemmatize(word)
+                            if lemma_word not in lemmatized_words:
+                                lemmatized_words.append(lemma_word)
+                        # pulling whether or not keywords like "Checklist" or "Process" occur?
+                        # pulling whether "HOWTO" occurs
+                        unique_word_count = len(lemmatized_words)
+                        print(word_count)
+                        all_word_counts.append(unique_word_count)
+                        doc_word_len = []
+                        header_count = 0
+                        for word in word_list:
+                            if "#" in word:
+                                header_count += 1
+                            doc_word_len.append(len(word))
+                        print(header_count)
+                        all_header_counts.append(header_count)
+                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
+    #print(sum(all_word_len)/len(all_word_len))
+    #print(sum(all_word_counts)/len(all_word_counts))
+    print(mean(all_header_counts))
+    print(median(all_header_counts))
+
 if __name__ == "__main__":
-    main()
+    consolidate_readme()

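get_main_for_splice() above gathers, per CONTRIBUTING document, a raw word count, a unique-lemma count, a count of tokens containing "#" (a rough Markdown-header proxy), and a mean word length. A condensed sketch of that per-document computation; the helper name doc_stats is illustrative, and it assumes nltk.download('wordnet') has already run, as in the script:

import nltk

def doc_stats(path):
    # returns (word count, unique lemma count, '#'-token count, mean word length) for one document
    with open(path, "r") as file:
        word_list = file.read().split()
    lemmatizer = nltk.WordNetLemmatizer()
    unique_lemmas = {lemmatizer.lemmatize(word) for word in word_list}
    header_count = sum(1 for word in word_list if "#" in word)
    mean_word_len = sum(len(word) for word in word_list) / len(word_list) if word_list else 0
    return len(word_list), len(unique_lemmas), header_count, mean_word_len

Reusing a single WordNetLemmatizer instance (rather than constructing one per word, as the loop above does) keeps the pass over each document cheap.
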
kk_final_readme_roster.csv (new file, 2695 lines) — file diff suppressed because it is too large.
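This roster is what consolidate_readme() writes: one row per backed-up README, joined onto the package metadata by substring-matching the package name against upstream_vcs_link. A pandas-only sketch of the same join; the helper name build_readme_roster and its arguments are illustrative:

import os
import pandas as pd

def build_readme_roster(readme_dir, in_csv, out_csv):
    df = pd.read_csv(in_csv)
    rows = []
    for filename in os.listdir(readme_dir):
        pkg_name = filename[:-len("_readme.md")]
        # same substring match as consolidate_readme(): the package name occurs in the VCS link
        matches = df[df["upstream_vcs_link"].str.contains(pkg_name, na=False, regex=False)]
        for _, row in matches.iterrows():
            row = row.copy()
            row["readme_filepath"] = readme_dir + filename
            rows.append(row)
    pd.DataFrame(rows).to_csv(out_csv, index=False)

# example call, using the paths from the scripts above:
# build_readme_roster("/data/users/mgaughan/kkex_files_022124/readme/",
#                     "kk_final_expanded_data_final.csv",
#                     "kk_final_readme_roster.csv")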