backup, cleaning files
This commit is contained in:
		
							parent
							
								
									6e822bf64b
								
							
						
					
					
						commit
						6d4f56abe6
					
				
							
								
								
									
										35
									
								
								cleaning_did_data.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								cleaning_did_data.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,35 @@ | |||||||
|  | import csv | ||||||
|  | import pandas as pd | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | temp_dir = "/data/users/mgaughan/tmp3/" | ||||||
|  | with open("final_readme_did.csv", "w") as writing_file: | ||||||
|  |         csv_writer = csv.writer(writing_file) | ||||||
|  |         for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]:    | ||||||
|  |             file_project = "".join(filename.split("_")[:-1]) | ||||||
|  |             with open("kk_031624_pr_did.csv", "r") as file1:  | ||||||
|  |                 reader_obj = csv.reader(file1) | ||||||
|  |                 for line in reader_obj: | ||||||
|  |                     if line[0] == "": | ||||||
|  |                         continue  | ||||||
|  |                     if "github" in line[0] or "gitlab" in line[0]: | ||||||
|  |                         #making an evaluation that sub branches aren't being used and that people would fork if needed | ||||||
|  |                         #this only looks at main | ||||||
|  |                         temp_vcs = "/".join(line[0].split("/")[0:5]) | ||||||
|  |                         project_name = temp_vcs.split('/')[4]  | ||||||
|  |                     else: | ||||||
|  |                         project_name = temp_dir + line[0].split('/')[- 1] | ||||||
|  |                     if file_project == project_name: | ||||||
|  |                         csv_writer.writerow(line) | ||||||
|  |                          | ||||||
|  | ''' | ||||||
|  | for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]: | ||||||
|  |     file_project = "".join(filename.split("_")[:-1]) | ||||||
|  |     for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]: | ||||||
|  |         file_project2 = "".join(filename2.split("_")[:-1]) | ||||||
|  |         if filename != filename2 and file_project == file_project2: | ||||||
|  |             os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2) | ||||||
|  | ''' | ||||||
|  |              | ||||||
|  |                  | ||||||
| @ -12,7 +12,7 @@ import math | |||||||
| import io  | import io  | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme" | working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing" | ||||||
| temp_dir = "/data/users/mgaughan/tmp3/" | temp_dir = "/data/users/mgaughan/tmp3/" | ||||||
| 
 | 
 | ||||||
| # getting the specific readme or contributing file from a given commit  | # getting the specific readme or contributing file from a given commit  | ||||||
| @ -44,12 +44,10 @@ def get_file(vcs_link, commit_hash, is_readme): | |||||||
|                         target_filename = file['file'] |                         target_filename = file['file'] | ||||||
|             else: |             else: | ||||||
|                 if "CONTRIBUTING" in file['file']: |                 if "CONTRIBUTING" in file['file']: | ||||||
|                     ''' |  | ||||||
|                     if "/" in file['file']: |                     if "/" in file['file']: | ||||||
|                         target_filename = file['file'].split("/")[-1] |                         target_filename = file['file'].split("/")[-1] | ||||||
|                     else: |                     else: | ||||||
|                     ''' |                         target_filename = file['file'] | ||||||
|                     target_filename = str(file['file']) |  | ||||||
|     #print(commit.tree) |     #print(commit.tree) | ||||||
|     #getting the name of the file from the root directory |     #getting the name of the file from the root directory | ||||||
|     ''' |     ''' | ||||||
| @ -98,7 +96,7 @@ def get_file(vcs_link, commit_hash, is_readme): | |||||||
|     targetfile = "" |     targetfile = "" | ||||||
|     for blob in commit0.tree.blobs: |     for blob in commit0.tree.blobs: | ||||||
|         #print(type(blob.path)) |         #print(type(blob.path)) | ||||||
|         if "README" in blob.path: |         if "CONTRIBUTING" in blob.path: | ||||||
|             targetfile = blob |             targetfile = blob | ||||||
|             #print(blob.path) |             #print(blob.path) | ||||||
|         # why would a file not be in the commit tree? but would be in the directory? |         # why would a file not be in the commit tree? but would be in the directory? | ||||||
| @ -124,11 +122,11 @@ def get_file(vcs_link, commit_hash, is_readme): | |||||||
| 
 | 
 | ||||||
| def for_all_files(): | def for_all_files(): | ||||||
|     #toggle this based on readme or contributing files |     #toggle this based on readme or contributing files | ||||||
|     readme_is = True |     readme_is = False | ||||||
|     csv_path = "kk_031624_pr_did.csv" |     csv_path = "final_data/deb_contrib_did_data.csv" | ||||||
|     index = -1 |     index = -1 | ||||||
|     with open(csv_path, 'r') as file: |     with open(csv_path, 'r') as file: | ||||||
|         with open('a_031824_spec_errors.csv', "w") as writing_file: |         with open('c_031824_spec_errors.csv', "w") as writing_file: | ||||||
|             csv_writer = csv.writer(writing_file) |             csv_writer = csv.writer(writing_file) | ||||||
|             #csv_reader = csv.DictReader(file) |             #csv_reader = csv.DictReader(file) | ||||||
|             lines = [line for line in file] |             lines = [line for line in file] | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user