From 6d4f56abe663595beaa5fb05a0b69642700289f7 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Sun, 31 Mar 2024 16:38:56 -0500
Subject: [PATCH] backup, cleaning files

---
 cleaning_did_data.py | 35 +++++++++++++++++++++++++++++++++++
 get_spec_file.py     | 14 ++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)
 create mode 100644 cleaning_did_data.py

diff --git a/cleaning_did_data.py b/cleaning_did_data.py
new file mode 100644
index 0000000..d097c1b
--- /dev/null
+++ b/cleaning_did_data.py
@@ -0,0 +1,35 @@
+import csv
+import pandas as pd
+import os
+
+
+temp_dir = "/data/users/mgaughan/tmp3/"  # same scratch path as get_spec_file.py; not used for matching
+
+
+def project_name_from_link(link):
+    """Return the project name for a row's first column (VCS link or other URL), or None if too short."""
+    parts = link.split("/")
+    if "github" in link or "gitlab" in link:
+        # sub-branches are assumed unused (people fork instead), so only scheme://host/owner/repo is kept
+        return parts[4] if len(parts) > 4 else None
+    # other links: last path segment only; prefixing temp_dir here could never equal a bare file name
+    return parts[-1]
+
+
+# Index the DiD rows by project once instead of re-reading the CSV for every README file.
+rows_by_project = {}
+with open("kk_031624_pr_did.csv", "r", newline="") as did_file:
+    for row in csv.reader(did_file):
+        if not row or row[0] == "":
+            continue  # blank line or row without a link
+        name = project_name_from_link(row[0])
+        if name:
+            rows_by_project.setdefault(name, []).append(row)
+
+# Keep only the rows whose project has a README file on disk, in directory-listing order.
+with open("final_readme_did.csv", "w", newline="") as writing_file:
+    csv_writer = csv.writer(writing_file)
+    for filename in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme"):
+        # NOTE(review): "".join drops underscores inside project names -- confirm the file naming scheme
+        file_project = "".join(filename.split("_")[:-1])
+        csv_writer.writerows(rows_by_project.get(file_project, []))
diff --git a/get_spec_file.py b/get_spec_file.py
index 87767c9..6a889bc 100644
--- a/get_spec_file.py
+++ b/get_spec_file.py
@@ -12,7 +12,7 @@ import math
 import io 
 import re
 
-working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme"
+working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing"
 temp_dir = "/data/users/mgaughan/tmp3/"
 
 # getting the specific readme or contributing file from a given commit 
@@ -44,12 +44,10 @@ def get_file(vcs_link, commit_hash, is_readme):
                         target_filename = file['file']
             else:
                 if "CONTRIBUTING" in file['file']:
-                    '''
                     if "/" in file['file']:
                         target_filename = file['file'].split("/")[-1]
                     else:
-                    '''
-                    target_filename = str(file['file'])
+                        target_filename = file['file']
     #print(commit.tree)
     #getting the name of the file from the root directory
     '''
@@ -98,7 +96,7 @@ def get_file(vcs_link, commit_hash, is_readme):
     targetfile = ""
     for blob in commit0.tree.blobs:
         #print(type(blob.path))
-        if "README" in blob.path:
+        if "CONTRIBUTING" in blob.path:
             targetfile = blob
             #print(blob.path)
         # why would a file not be in the commit tree? but would be in the directory?
@@ -124,11 +122,11 @@ def get_file(vcs_link, commit_hash, is_readme):
 
 def for_all_files():
     #toggle this based on readme or contributing files
-    readme_is = True
-    csv_path = "kk_031624_pr_did.csv"
+    readme_is = False
+    csv_path = "final_data/deb_contrib_did_data.csv"
     index = -1
     with open(csv_path, 'r') as file:
-        with open('a_031824_spec_errors.csv', "w") as writing_file:
+        with open('c_031824_spec_errors.csv', "w") as writing_file:
             csv_writer = csv.writer(writing_file)
             #csv_reader = csv.DictReader(file)
             lines = [line for line in file]