backing up readme
This commit is contained in:
parent 8a348ed94e
commit b548f2eab0
@@ -9,24 +9,34 @@ key = os.environ.get('KKEXKEY')
 
 def main():
     wd = os.getcwd()
-    with open("expanded_data_final.csv", "r") as file:
+    with open("kk_final_expanded_data_final.csv", "r") as file:
         reader = csv.reader(file)
         for i, line in enumerate(reader):
             print("")
             repo_name = line[5].strip().split("/")[-1]
             print("on " + repo_name)
             repo_url = str(line[5].strip())
-            temp_dir = "/data/users/mgaughan/kkex_contrib_files_122023/tmp/" + repo_name
+            temp_dir = "/data/users/mgaughan/kkex_readme_021824/tmp" + repo_name
             try:
                 Repo.clone_from(repo_url, temp_dir)
             except:
                 print("issue cloning")
                 continue
+            '''
             if os.path.exists(temp_dir + "/.all-contributorsrc"):
                 shutil.copyfile(temp_dir + "/.all-contributorsrc", "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/" + repo_name + "_contrib.json")
             if os.path.exists(temp_dir + "/CONTRIBUTING.md"):
                 shutil.copyfile(temp_dir + "/CONTRIBUTING.md", "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/" + repo_name + "_inst.md")
+            '''
+            #cannot run yet, with api chokepoint
+            if os.path.exists(temp_dir + "/README.md"):
+                shutil.copyfile(temp_dir + "/README.md", "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md")
+            if os.path.exists(temp_dir + "/GOVERNANCE.md"):
+                shutil.copyfile(temp_dir + "/GOVERNANCE.md", "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_readme.md")
             shutil.rmtree(temp_dir, ignore_errors=True)
+    print('done')
 
 
 if __name__ == "__main__":
     main()
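The hunk above switches the input roster to kk_final_expanded_data_final.csv, points temp_dir at the kkex_readme_021824 staging area, comments out the contributing-file copies, and backs up README.md and GOVERNANCE.md instead. A minimal standalone sketch of that clone-copy-cleanup step, with the bare except narrowed to GitPython's GitCommandError (illustrative only; the function name and default paths below are placeholders, not the ones used in the commit):

import os
import shutil
from git import Repo, GitCommandError

def backup_readme(repo_url, repo_name, tmp_root="/tmp/kkex", out_dir="/tmp/readmes"):
    temp_dir = os.path.join(tmp_root, repo_name)
    try:
        # clone the upstream repository into a throwaway working directory
        Repo.clone_from(repo_url, temp_dir)
    except GitCommandError:
        print("issue cloning " + repo_url)
        return
    readme_path = os.path.join(temp_dir, "README.md")
    if os.path.exists(readme_path):
        # keep only the file we care about, named after the package
        shutil.copyfile(readme_path, os.path.join(out_dir, repo_name + "_readme.md"))
    # always remove the clone so the staging area stays small
    shutil.rmtree(temp_dir, ignore_errors=True)

Catching GitCommandError specifically keeps keyboard interrupts and programming errors from being silently reported as a cloning issue.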
@@ -2,6 +2,7 @@ import csv
 import os
 import nltk
 import pandas as pd
+from statistics import mean, median
 #from nltk.stem import WordNetLemmatizer
 
 nltk.download('wordnet')
@@ -60,6 +61,30 @@ def consolidate_csv():
             print(row_value)
             writer.writerow(row_value)
 
+def consolidate_readme():
+    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
+    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
+    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
+    columns = list(total_underprod_csv.columns)
+    columns.append("readme_filepath")
+    total_count = 0
+    success_count = 0
+    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
+        writer = csv.writer(output_file, columns)
+        writer.writerow(columns)
+        for filename in os.listdir(readme_dir):
+            total_count += 1
+            row_value = []
+            cleaning_files = "_readme.md"
+            pkg_name = filename[:-len(cleaning_files)]
+            print(pkg_name)
+            for item in list_of_links:
+                if pkg_name in item:
+                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
+                    row_value.append(readme_dir + filename)
+                    print(row_value)
+                    writer.writerow(row_value)
+
 def consolidate_csv_2():
     rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
     total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
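consolidate_readme, added above, joins each backed-up *_readme.md file to its row in kk_final_expanded_data_final.csv by testing whether the package name appears anywhere in upstream_vcs_link, which can match more than one row when repository names overlap. A stricter check would compare against the final path segment of the URL; a small sketch of that alternative (illustrative, not part of the commit):

def link_matches(pkg_name, upstream_vcs_link):
    # compare against the repository name at the end of the URL,
    # so "request" does not also match ".../requests"
    return upstream_vcs_link.strip().rstrip("/").split("/")[-1] == pkg_name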
@@ -83,8 +108,46 @@ def consolidate_csv_2():
             print(row_value)
             writer.writerow(row_value)
 
+def get_main_for_splice():
+    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
+    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
+    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
+    all_word_counts = []
+    all_word_len = []
+    all_header_counts = []
+    actual_index = 0
+    for index, row in inst_doc_df.iterrows():
+        actual_index += 1
+        if actual_index < 700:
+            for filename in os.listdir(instructions_dir):
+                instructions_metadata = {}
+                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
+                    with open(instructions_dir + filename, "r") as file:
+                        word_list = file.read().split()
+                        word_count = len(word_list)
+                        lemmatized_words = []
+                        for word in word_list:
+                            lemma_word = nltk.WordNetLemmatizer().lemmatize(word)
+                            if lemma_word not in lemmatized_words:
+                                lemmatized_words.append(lemma_word)
+                        # pulling whether or not keywords like "Checklist" or "Process" occur?
+                        # pulling whether "HOWTO" occurs
+                        unique_word_count = len(word_list)
+                        print(word_count)
+                        all_word_counts.append(unique_word_count)
+                        doc_word_len = []
+                        header_count = 0
+                        for word in word_list:
+                            if "#" in word:
+                                header_count += 1
+                            doc_word_len.append(len(word))
+                        print(header_count)
+                        all_header_counts.append(header_count)
+                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
+    #print(sum(all_word_len)/len(all_word_len))
+    #print(sum(all_word_counts)/len(all_word_counts))
+    print(mean(all_header_counts))
+    print(median(all_header_counts))
+
 if __name__ == "__main__":
-    main()
+    consolidate_readme()
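get_main_for_splice, added above, reads each CONTRIBUTING backup, lemmatizes its words, and collects per-document word counts, average word lengths, and "#" header counts before printing the mean and median header count. The per-word nltk.WordNetLemmatizer() construction is the costly part of that loop; a leaner per-document pass that reuses a single lemmatizer looks roughly like this (a sketch, assuming nltk and the wordnet data are installed as in the script above; doc_stats is a hypothetical helper, not part of the commit):

from statistics import mean
import nltk

nltk.download('wordnet', quiet=True)
lemmatizer = nltk.WordNetLemmatizer()

def doc_stats(text):
    # one pass over the document: raw word count, unique lemma count,
    # markdown header count, and average word length
    word_list = text.split()
    unique_lemmas = {lemmatizer.lemmatize(word) for word in word_list}
    header_count = sum(1 for word in word_list if "#" in word)
    avg_word_len = mean(len(word) for word in word_list) if word_list else 0
    return len(word_list), len(unique_lemmas), header_count, avg_word_len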
kk_final_readme_roster.csv (Normal file, 2695 lines)
File diff suppressed because it is too large