backing up readme

parent 8a348ed94e
commit b548f2eab0
@@ -9,24 +9,34 @@ key = os.environ.get('KKEXKEY')
 
 def main():
     wd = os.getcwd()
-    with open("expanded_data_final.csv", "r") as file:
+    with open("kk_final_expanded_data_final.csv", "r") as file:
         reader = csv.reader(file)
         for i, line in enumerate(reader):
             print("")
             repo_name = line[5].strip().split("/")[-1]
             print("on " + repo_name)
             repo_url = str(line[5].strip())
-            temp_dir = "/data/users/mgaughan/kkex_contrib_files_122023/tmp/" + repo_name
+            temp_dir = "/data/users/mgaughan/kkex_readme_021824/tmp" + repo_name
             try:
                 Repo.clone_from(repo_url, temp_dir)
             except:
                 print("issue cloning")
                 continue
+            '''
             if os.path.exists(temp_dir + "/.all-contributorsrc"):
                 shutil.copyfile(temp_dir + "/.all-contributorsrc", "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/" + repo_name + "_contrib.json")
             if os.path.exists(temp_dir + "/CONTRIBUTING.md"):
                 shutil.copyfile(temp_dir + "/CONTRIBUTING.md", "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/" + repo_name + "_inst.md")
+            '''
+            #cannot run yet, with api chokepoint
+            if os.path.exists(temp_dir + "/README.md"):
+                shutil.copyfile(temp_dir + "/README.md", "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md")
+            if os.path.exists(temp_dir + "/GOVERNANCE.md"):
+                shutil.copyfile(temp_dir + "/GOVERNANCE.md", "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_readme.md")
             shutil.rmtree(temp_dir, ignore_errors=True)
+    print('done')
+
+
 
 if __name__ == "__main__":
     main()
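Two details in the new main() look unintended rather than deliberate: the new temp path concatenates "tmp" + repo_name with no separator, so clones land in sibling directories named like tmpfoo instead of under tmp/, and the GOVERNANCE.md backup is named with the "_readme.md" suffix where "_governance.md" was presumably intended. A minimal sketch of the likely-intended copy step, assuming the same directory layout (backup_docs is a hypothetical helper, not part of this commit):

    import os
    import shutil
    from git import Repo, GitCommandError

    def backup_docs(repo_url, repo_name):
        # hedged fix sketch, not the commit's code: join paths safely and
        # give governance backups their own suffix
        temp_dir = os.path.join("/data/users/mgaughan/kkex_readme_021824/tmp", repo_name)
        try:
            Repo.clone_from(repo_url, temp_dir)
        except GitCommandError:  # narrower than a bare except:
            print("issue cloning " + repo_url)
            return
        targets = {
            "README.md": "/data/users/mgaughan/kkex_files_022124/readme/" + repo_name + "_readme.md",
            "GOVERNANCE.md": "/data/users/mgaughan/kkex_files_022124/governance/" + repo_name + "_governance.md",
        }
        for doc, dest in targets.items():
            src = os.path.join(temp_dir, doc)
            if os.path.exists(src):
                shutil.copyfile(src, dest)
        shutil.rmtree(temp_dir, ignore_errors=True)

Catching GitCommandError instead of everything also keeps Ctrl-C usable during a long crawl; the committed bare except: swallows KeyboardInterrupt too.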
@@ -2,6 +2,7 @@ import csv
 import os 
 import nltk
 import pandas as pd
+from statistics import mean, median
 #from nltk.stem import WordNetLemmatizer
 
 nltk.download('wordnet')
@@ -59,6 +60,30 @@ def consolidate_csv():
             row_value.append(instructions_dir + filename)
             print(row_value)
             writer.writerow(row_value)
+
+def consolidate_readme():
+    readme_dir = "/data/users/mgaughan/kkex_files_022124/readme/"
+    total_underprod_csv = pd.read_csv("kk_final_expanded_data_final.csv")
+    list_of_links = total_underprod_csv["upstream_vcs_link"].tolist()
+    columns = list(total_underprod_csv.columns)
+    columns.append("readme_filepath")
+    total_count = 0
+    success_count = 0
+    with open("kk_final_readme_roster.csv", 'w', newline='') as output_file:
+        writer = csv.writer(output_file, columns)
+        writer.writerow(columns)
+        for filename in os.listdir(readme_dir):
+            total_count += 1
+            row_value = []
+            cleaning_files = "_readme.md"
+            pkg_name = filename[:-len(cleaning_files)]
+            print(pkg_name)
+            for item in list_of_links:
+                if pkg_name in item:
+                    row_value = total_underprod_csv.loc[total_underprod_csv["upstream_vcs_link"] == item].values.tolist()[0]
+            row_value.append(readme_dir + filename)
+            print(row_value)
+            writer.writerow(row_value)
 
 def consolidate_csv_2():
     rosters_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contributing_lists/"
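A few flags on consolidate_readme() as committed: csv.writer's second positional argument is a dialect, so passing columns there is most likely ignored rather than treated as a header (the explicit writer.writerow(columns) is what actually writes it); the substring test pkg_name in item can match several upstream links, with the last match winning, and files with no match are written as a row holding only the filepath; success_count is initialized but never incremented. A hedged sketch of a stricter join, assuming a package's name equals the final path segment of its upstream_vcs_link (the dict-based matching and the function name here are illustrative, not the commit's approach):

    import csv
    import os
    import pandas as pd

    def consolidate_readme_strict(readme_dir="/data/users/mgaughan/kkex_files_022124/readme/"):
        df = pd.read_csv("kk_final_expanded_data_final.csv")
        # index rows by the last path segment of the upstream link
        by_name = {link.strip().split("/")[-1]: link for link in df["upstream_vcs_link"]}
        columns = list(df.columns) + ["readme_filepath"]
        with open("kk_final_readme_roster.csv", "w", newline="") as output_file:
            writer = csv.writer(output_file)
            writer.writerow(columns)
            for filename in os.listdir(readme_dir):
                pkg_name = filename[:-len("_readme.md")]
                link = by_name.get(pkg_name)
                if link is None:
                    continue  # exact match required; skip stub rows
                row = df.loc[df["upstream_vcs_link"] == link].values.tolist()[0]
                writer.writerow(row + [readme_dir + filename])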
@@ -83,8 +108,46 @@ def consolidate_csv_2():
             print(row_value)
             writer.writerow(row_value)
 
+
+def get_main_for_splice():
+    inst_doc_df = pd.read_csv("kk_final_doclist_roster.csv")
+    inst_doc_df = inst_doc_df.sort_values(by=['underproduction_mean'])
+    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
+    all_word_counts = []
+    all_word_len = []
+    all_header_counts = []
+    actual_index = 0
+    for index, row in inst_doc_df.iterrows():
+        actual_index += 1
+        if actual_index < 700:
+            for filename in os.listdir(instructions_dir):
+                instructions_metadata = {}
+                if row["upstream_vcs_link"].strip().split("/")[-1] == filename[:-len("_inst.md")]:
+                    with open(instructions_dir + filename, "r") as file:
+                        word_list = file.read().split()
+                        word_count = len(word_list)
+                        lemmatized_words = []
+                        for word in word_list:
+                            lemma_word = nltk.WordNetLemmatizer().lemmatize(word)
+                            if lemma_word not in lemmatized_words:
+                                lemmatized_words.append(lemma_word)
+                        # pulling whether or not keywords like "Checklist" or "Process" occur?
+                        # pulling whether "HOWTO" occurs
+                        unique_word_count = len(word_list)
+                        print(word_count)
+                        all_word_counts.append(unique_word_count)
+                        doc_word_len = []
+                        header_count = 0
+                        for word in word_list:
+                            if "#" in word:
+                                header_count += 1
+                            doc_word_len.append(len(word))
+                        print(header_count)
+                        all_header_counts.append(header_count)
+                        all_word_len.append(sum(doc_word_len)/len(doc_word_len))
+    #print(sum(all_word_len)/len(all_word_len))
+    #print(sum(all_word_counts)/len(all_word_counts))
+    print(mean(all_header_counts))
+    print(median(all_header_counts))
 
 
 if __name__ == "__main__":
-    main()
+    consolidate_readme()
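One more likely slip in get_main_for_splice(): unique_word_count is assigned len(word_list) immediately after the loop that deduplicates into lemmatized_words, so all_word_counts records raw token counts and the lemmatization work is discarded; len(lemmatized_words) was presumably meant (instructions_metadata is likewise built and never used). A small sketch of the per-document statistics under that reading, where doc_stats is an illustrative helper rather than the commit's code:

    import nltk
    from statistics import mean, median

    def doc_stats(text):
        # whitespace tokenization, matching the committed code
        words = text.split()
        lemmatizer = nltk.WordNetLemmatizer()
        # unique lemmas: what unique_word_count presumably meant to capture
        unique_lemmas = {lemmatizer.lemmatize(w) for w in words}
        header_count = sum(1 for w in words if "#" in w)  # crude markdown-header proxy
        avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0
        return len(words), len(unique_lemmas), header_count, avg_word_len

Aggregating header_count across documents with mean() and median() reproduces the two printed summary statistics. Note also that the entry point now runs consolidate_readme() only; get_main_for_splice() is defined but not called anywhere in this commit.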
 2695  kk_final_readme_roster.csv  (new file)
       File diff suppressed because it is too large