backing up readme

parent 8a348ed94e
commit b548f2eab0
@@ -9,24 +9,34 @@ key = os.environ.get('KKEXKEY')
 def main():
     wd = os.getcwd()
-    with open("expanded_data_final.csv", "r") as file:
+    with open("kk_final_expanded_data_final.csv", "r") as file:
         reader = csv.reader(file)
         for i, line in enumerate(reader):
             print("")
             repo_name = line[5].strip().split("/")[-1]
             print("on " + repo_name)
             repo_url = str(line[5].strip())
-            temp_dir = "/data/users/mgaughan/kkex_contrib_files_122023/tmp/" + repo_name
+            temp_dir = "/data/users/mgaughan/kkex_readme_021824/tmp" + repo_name
             try:
                 Repo.clone_from(repo_url, temp_dir)
             except:
                 print("issue cloning")
                 continue
+            '''
             if os.path.exists(temp_dir + "/.all-contributorsrc"):
                 shutil.copyfile(temp_dir + "/.all-contributorsrc", "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/" + repo_name + "_contrib.json")
             if os.path.exists(temp_dir + "/CONTRIBUTING.md"):
                 shutil.copyfile(temp_dir + "/CONTRIBUTING.md", "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/" + repo_name + "_inst.md")
+            '''
+            #cannot run yet, with api chokepoint
+            if os.path.exists(temp_dir + "/README.md"):
+                shutil.copyfile(temp_dir + "/README.md", "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md")
+            if os.path.exists(temp_dir + "/GOVERNANCE.md"):
+                shutil.copyfile(temp_dir + "/GOVERNANCE.md", "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_readme.md")
             shutil.rmtree(temp_dir, ignore_errors=True)
     print('done')


 if __name__ == "__main__":
     main()
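Two details worth flagging in this hunk: the new path concatenates "tmp" + repo_name with no separator, and the GOVERNANCE.md copy reuses the "_readme.md" suffix, so governance backups land with readme-style names. The sketch below is not part of the commit; it shows the same clone-and-copy step with those details tightened, assuming GitPython and the directory layout above (back_up_docs, tmp_root, and out_root are hypothetical names).

import os
import shutil
from git import Repo, GitCommandError

def back_up_docs(repo_url, repo_name, tmp_root, out_root):
    # os.path.join keeps the separator that "tmp" + repo_name drops
    temp_dir = os.path.join(tmp_root, repo_name)
    try:
        Repo.clone_from(repo_url, temp_dir)
    except GitCommandError as err:
        print("issue cloning: " + str(err))
        return
    # copy each doc if present; "_governance.md" avoids reusing the readme suffix
    for doc, subdir, suffix in [("README.md", "readme", "_readme.md"),
                                ("GOVERNANCE.md", "governance", "_governance.md")]:
        src = os.path.join(temp_dir, doc)
        if os.path.exists(src):
            shutil.copyfile(src, os.path.join(out_root, subdir, repo_name + suffix))
    shutil.rmtree(temp_dir, ignore_errors=True)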
@@ -2,6 +2,7 @@ import csv
 import os
+import nltk
 import pandas as pd
 from statistics import mean, median
 #from nltk.stem import WordNetLemmatizer

 nltk.download('wordnet')
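A side note, not something this diff changes: nltk.download('wordnet') fetches the corpus that WordNetLemmatizer reads at call time, and on recent NLTK releases the Open Multilingual Wordnet is needed as well, so a fuller setup (an assumption about the runtime environment) would be:

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # also required by WordNetLemmatizer on newer NLTK releases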
@@ -59,6 +60,30 @@ def consolidate_csv():
                     row_value.append(instructions_dir + filename)
                     print(row_value)
                     writer.writerow(row_value)


+def consolidate_readme():
+    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
+    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
+    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
+    columns = list(total_underprod_csv.columns)
+    columns.append("readme_filepath")
+    total_count = 0
+    success_count = 0
+    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
+        writer = csv.writer(output_file, columns)
+        writer.writerow(columns)
+        for filename in os.listdir(readme_dir):
+            total_count += 1
+            row_value = []
+            cleaning_files = "_readme.md"
+            pkg_name = filename[:-len(cleaning_files)]
+            print(pkg_name)
+            for item in list_of_links:
+                if pkg_name in item:
+                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
+                    row_value.append(readme_dir + filename)
+                    print(row_value)
+                    writer.writerow(row_value)

 def consolidate_csv_2():
     rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
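In consolidate_readme() above, the second positional argument to csv.writer is the dialect, not a header list, so passing columns there is effectively inert (the header only appears because of the explicit writer.writerow(columns)); success_count is also initialized but never incremented, and "if pkg_name in item" is a substring match that can pair a readme with the wrong link when one package name is a prefix of another. A sketch of the same roster join with an exact tail match, not the commit's code, with column and file names taken from the hunk above:

import csv
import os
import pandas as pd

def consolidate_readme(readme_dir, out_csv="kk_final_readme_roster.csv"):
    df = pd.read_csv("kk_final_expanded_data_final.csv")
    columns = list(df.columns) + ["readme_filepath"]
    # index links by their final path segment for an exact, not substring, match
    by_tail = {link.strip().split("/")[-1]: link for link in df["upstream_vcs_link"]}
    with open(out_csv, "w", newline="") as output_file:
        writer = csv.writer(output_file)  # default dialect; a column list is not a writer argument
        writer.writerow(columns)
        for filename in os.listdir(readme_dir):
            pkg_name = filename[:-len("_readme.md")]
            link = by_tail.get(pkg_name)
            if link is None:
                continue
            row = df.loc[df["upstream_vcs_link"] == link].values.tolist()[0]
            writer.writerow(row + [readme_dir + filename])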
@@ -83,8 +108,46 @@ def consolidate_csv_2():
             print(row_value)
             writer.writerow(row_value)


+def get_main_for_splice():
+    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
+    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
+    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
+    all_word_counts = []
+    all_word_len = []
+    all_header_counts = []
+    actual_index = 0
+    for index, row in inst_doc_df.iterrows():
+        actual_index += 1
+        if actual_index < 700:
+            for filename in os.listdir(instructions_dir):
+                instructions_metadata = {}
+                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
+                    with open(instructions_dir + filename, "r") as file:
+                        word_list = file.read().split()
+                        word_count = len(word_list)
+                        lemmatized_words = []
+                        for word in word_list:
+                            lemma_word = nltk.WordNetLemmatizer().lemmatize(word)
+                            if lemma_word not in lemmatized_words:
+                                lemmatized_words.append(lemma_word)
+                        # pulling whether or not keywords like "Checklist" or "Process" occur?
+                        # pulling whether "HOWTO" occurs
+                        unique_word_count = len(word_list)
+                        print(word_count)
+                        all_word_counts.append(unique_word_count)
+                        doc_word_len = []
+                        header_count = 0
+                        for word in word_list:
+                            if "#" in word:
+                                header_count += 1
+                            doc_word_len.append(len(word))
+                        print(header_count)
+                        all_header_counts.append(header_count)
+                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
+    #print(sum(all_word_len)/len(all_word_len))
+    #print(sum(all_word_counts)/len(all_word_counts))
+    print(mean(all_header_counts))
+    print(median(all_header_counts))

 if __name__ == "__main__":
-    main()
+    consolidate_readme()
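In get_main_for_splice() above, a new WordNetLemmatizer is constructed for every word, unique_word_count reuses len(word_list) (so the collected "unique" counts always equal the raw word counts), and the "not in list" membership test is quadratic. A per-document version of the same stats, offered as an assumption-labeled sketch rather than the commit's code, keeping the hunk's "#"-in-word heuristic for Markdown headers:

from statistics import mean, median
import nltk

def doc_stats(text):
    lemmatizer = nltk.WordNetLemmatizer()  # build once, not once per word
    words = text.split()
    lemmas = {lemmatizer.lemmatize(w) for w in words}  # a set deduplicates in O(1) per word
    header_count = sum(1 for w in words if "#" in w)
    avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0
    # len(lemmas) is the unique-lemma count the hunk appears to intend
    return len(words), len(lemmas), header_count, avg_word_len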
kk_final_readme_roster.csv  2695 lines  (new file)
File diff suppressed because it is too large