From 6d4f56abe663595beaa5fb05a0b69642700289f7 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Sun, 31 Mar 2024 16:38:56 -0500 Subject: [PATCH] backup, cleaning files --- cleaning_did_data.py | 35 +++++++++++++++++++++++++++++++++++ get_spec_file.py | 14 ++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) create mode 100644 cleaning_did_data.py diff --git a/cleaning_did_data.py b/cleaning_did_data.py new file mode 100644 index 0000000..d097c1b --- /dev/null +++ b/cleaning_did_data.py @@ -0,0 +1,35 @@ +import csv +import pandas as pd +import os + + +temp_dir = "/data/users/mgaughan/tmp3/" +with open("final_readme_did.csv", "w") as writing_file: + csv_writer = csv.writer(writing_file) + for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]: + file_project = "".join(filename.split("_")[:-1]) + with open("kk_031624_pr_did.csv", "r") as file1: + reader_obj = csv.reader(file1) + for line in reader_obj: + if line[0] == "": + continue + if "github" in line[0] or "gitlab" in line[0]: + #making an evaluation that sub branches aren't being used and that people would fork if needed + #this only looks at main + temp_vcs = "/".join(line[0].split("/")[0:5]) + project_name = temp_vcs.split('/')[4] + else: + project_name = temp_dir + line[0].split('/')[- 1] + if file_project == project_name: + csv_writer.writerow(line) + +''' +for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]: + file_project = "".join(filename.split("_")[:-1]) + for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]: + file_project2 = "".join(filename2.split("_")[:-1]) + if filename != filename2 and file_project == file_project2: + os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2) +''' + + \ No newline at end of file diff --git a/get_spec_file.py b/get_spec_file.py index 87767c9..6a889bc 100644 --- a/get_spec_file.py +++ b/get_spec_file.py @@ -12,7 +12,7 @@ import math import io import re -working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme" +working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing" temp_dir = "/data/users/mgaughan/tmp3/" # getting the specific readme or contributing file from a given commit @@ -44,12 +44,10 @@ def get_file(vcs_link, commit_hash, is_readme): target_filename = file['file'] else: if "CONTRIBUTING" in file['file']: - ''' if "/" in file['file']: target_filename = file['file'].split("/")[-1] else: - ''' - target_filename = str(file['file']) + target_filename = file['file'] #print(commit.tree) #getting the name of the file from the root directory ''' @@ -98,7 +96,7 @@ def get_file(vcs_link, commit_hash, is_readme): targetfile = "" for blob in commit0.tree.blobs: #print(type(blob.path)) - if "README" in blob.path: + if "CONTRIBUTING" in blob.path: targetfile = blob #print(blob.path) # why would a file not be in the commit tree? but would be in the directory? @@ -124,11 +122,11 @@ def get_file(vcs_link, commit_hash, is_readme): def for_all_files(): #toggle this based on readme or contributing files - readme_is = True - csv_path = "kk_031624_pr_did.csv" + readme_is = False + csv_path = "final_data/deb_contrib_did_data.csv" index = -1 with open(csv_path, 'r') as file: - with open('a_031824_spec_errors.csv', "w") as writing_file: + with open('c_031824_spec_errors.csv', "w") as writing_file: csv_writer = csv.writer(writing_file) #csv_reader = csv.DictReader(file) lines = [line for line in file]