diff --git a/cleaning_did_data.py b/cleaning_did_data.py
index d097c1b..eb7a99d 100644
--- a/cleaning_did_data.py
+++ b/cleaning_did_data.py
@@ -2,11 +2,13 @@
 import csv
 import pandas as pd
 import os
 
+# the repo of files is the record of what projects we have the specific files for
+# use the repo of files to then match back to the DiD data
 temp_dir = "/data/users/mgaughan/tmp3/"
 with open("final_readme_did.csv", "w") as writing_file:
     csv_writer = csv.writer(writing_file)
-    for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]:
+    for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/")]:
         file_project = "".join(filename.split("_")[:-1])
         with open("kk_031624_pr_did.csv", "r") as file1:
             reader_obj = csv.reader(file1)
@@ -22,6 +24,7 @@ with open("final_readme_did.csv", "w") as writing_file:
                 project_name = temp_dir + line[0].split('/')[- 1]
                 if file_project == project_name:
                     csv_writer.writerow(line)
+                    break
 
 '''
 for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
diff --git a/get_spec_file.py b/get_spec_file.py
index f450538..010ba59 100644
--- a/get_spec_file.py
+++ b/get_spec_file.py
@@ -12,7 +12,7 @@ import math
 import io
 import re
 
-working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing"
+working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2"
 temp_dir = "/data/users/mgaughan/tmp3/"
 
 # getting the specific readme or contributing file from a given commit
@@ -48,20 +48,7 @@ def get_file(vcs_link, commit_hash, is_readme):
             target_filename = file['file'].split("/")[-1]
         else:
             target_filename = file['file']
-    #print(commit.tree)
-    #getting the name of the file from the root directory
-    '''
-    target_filename = ""
-    for filename in os.listdir(full_temp_path):
-        if is_readme:
-            #target_filename = "README.md"
-            if "README" in filename or "readme" in filename:
-                target_filename = filename
-        else:
-            #target_filename = "CONTRIBUTING.md"
-            if "CONTRIBUTING" in filename or "contributing" in filename:
-                target_filename = filename
-    '''
+
     if target_filename == "":
         shutil.rmtree(full_temp_path, ignore_errors=True)
         shutil.rmtree(other_temp_path, ignore_errors=True)
@@ -80,9 +67,9 @@ def get_file(vcs_link, commit_hash, is_readme):
         shutil.rmtree(full_temp_path, ignore_errors=True)
         shutil.rmtree(other_temp_path, ignore_errors=True)
         return "KeyError -- the file is not in the commit tree"
 
     if is_readme:
-        last_path = "readme"
+        last_path = "readme2"
     else:
         last_path = "contributing"
     with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file:
@@ -96,27 +82,31 @@ def get_file(vcs_link, commit_hash, is_readme):
 
 def for_all_files():
     #toggle this based on readme or contributing files
-    readme_is = False
-    csv_path = "final_data/deb_contrib_did_data.csv"
+    readme_is = True
+    csv_path = "kk_031624_pr_did.csv"
     index = -1
     with open(csv_path, 'r') as file:
-        with open('c_031824_spec_errors.csv', "w") as writing_file:
+        with open('d_031824_spec_errors.csv', "w") as writing_file:
             csv_writer = csv.writer(writing_file)
-            #csv_reader = csv.DictReader(file)
-            lines = [line for line in file]
-            for row in tqdm(csv.reader(lines), total=len(lines)):
-                index += 1
-                if index == 0:
-                    continue
-                if row[0] == "":
-                    continue
-                #print(row[0])
-                return_value = get_file(row[0], row[2], readme_is)
-                if return_value != "NoError":
-                    csv_writer.writerow([row[0], row[2], readme_is, return_value])
-                # if it is noError, just write the row down in a different csv
-                # there's an issue of duplicates, but just keep it moving
-                # if no duplicates -- just run it through
+            with open("readme_completed_downloads.csv", "w") as writing_file2:
+                csv_writer2 = csv.writer(writing_file2)
+                #csv_reader = csv.DictReader(file)
+                lines = [line for line in file]
+                for row in tqdm(csv.reader(lines), total=len(lines)):
+                    index += 1
+                    if index == 0:
+                        continue
+                    if row[0] == "":
+                        continue
+                    #print(row[0])
+                    return_value = get_file(row[0], row[2], readme_is)
+                    if return_value != "NoError":
+                        csv_writer.writerow([row[0], row[2], readme_is, return_value])
+                    else:
+                        csv_writer2.writerow(row)
+                    # if it is noError, just write the row down in a different csv
+                    # there's an issue of duplicates, but just keep it moving
+                    # if no duplicates -- just run it through
 
 if __name__ == "__main__":
     for_all_files()
\ No newline at end of file