backup, cleaning files

2024-03-31 16:38:56 -05:00 · 2024-03-31 16:38:56 -05:00 · 6d4f56abe6
commit 6d4f56abe6
parent 6e822bf64b
2 changed files with 41 additions and 8 deletions
--- a/cleaning_did_data.py
+++ b/cleaning_did_data.py
@ -0,0 +1,35 @@
 import csv
 import pandas as pd
 import os
 temp_dir = "/data/users/mgaughan/tmp3/"

 # Build final_readme_did.csv: for every file found in the readme directory,
 # copy the rows of kk_031624_pr_did.csv whose project name matches the
 # project name encoded in that file's name.
 #
 # The input CSV is parsed once and indexed by project name instead of being
 # re-read for every directory entry; rows are still written grouped by
 # directory entry and, within a group, in their original CSV order.
 rows_by_project = {}
 # newline="" is what the csv module expects for the files it reads and writes.
 with open("kk_031624_pr_did.csv", "r", newline="") as file1:
     for line in csv.reader(file1):
         # csv.reader yields [] for a blank line; skip those as well as rows
         # whose first column is empty.
         if not line or line[0] == "":
             continue
         if "github" in line[0] or "gitlab" in line[0]:
             # making an evaluation that sub branches aren't being used and
             # that people would fork if needed; this only looks at main.
             # The project name is the fifth "/"-separated piece of the link.
             project_name = line[0].split("/")[4]
         else:
             # NOTE(review): this value starts with temp_dir, while the name
             # it is compared against comes from an os.listdir() entry, so
             # rows taking this branch can never match a file. Kept as-is to
             # avoid changing the output data -- confirm whether the prefix
             # is intended.
             project_name = temp_dir + line[0].split("/")[-1]
         rows_by_project.setdefault(project_name, []).append(line)

 with open("final_readme_did.csv", "w", newline="") as writing_file:
     csv_writer = csv.writer(writing_file)
     for filename in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme"):
         # Everything before the last "_" of the file name, with the
         # remaining underscores removed.
         # NOTE(review): joining with "" drops underscores, so a project whose
         # name itself contains "_" will not match -- confirm against the
         # file naming scheme.
         file_project = "".join(filename.split("_")[:-1])
         csv_writer.writerows(rows_by_project.get(file_project, []))
 '''
 for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
    file_project = "".join(filename.split("_")[:-1])
    for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
        file_project2 = "".join(filename2.split("_")[:-1])
        if filename != filename2 and file_project == file_project2:
            os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2)
 '''
--- a/get_spec_file.py
+++ b/get_spec_file.py
@ -12,7 +12,7 @@ import math
 import io 
 import re
-working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme"
+working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing"
 temp_dir = "/data/users/mgaughan/tmp3/"
 # getting the specific readme or contributing file from a given commit 
@ -44,12 +44,10 @@ def get_file(vcs_link, commit_hash, is_readme):
                        target_filename = file['file']
            else:
                if "CONTRIBUTING" in file['file']:
                    '''
                    if "/" in file['file']:
                        target_filename = file['file'].split("/")[-1]
                    else:
-                    '''
+                        target_filename = file['file']
                    target_filename = str(file['file'])
    #print(commit.tree)
    #getting the name of the file from the root directory
    '''
@ -98,7 +96,7 @@ def get_file(vcs_link, commit_hash, is_readme):
    targetfile = ""
    for blob in commit0.tree.blobs:
        #print(type(blob.path))
-        if "README" in blob.path:
+        if "CONTRIBUTING" in blob.path:
            targetfile = blob
            #print(blob.path)
        # why would a file not be in the commit tree? but would be in the directory?
@ -124,11 +122,11 @@ def get_file(vcs_link, commit_hash, is_readme):
 def for_all_files():
    #toggle this based on readme or contributing files
-    readme_is = True
+    readme_is = False
-    csv_path = "kk_031624_pr_did.csv"
+    csv_path = "final_data/deb_contrib_did_data.csv"
    index = -1
    with open(csv_path, 'r') as file:
-        with open('a_031824_spec_errors.csv', "w") as writing_file:
+        with open('c_031824_spec_errors.csv', "w") as writing_file:
            csv_writer = csv.writer(writing_file)
            #csv_reader = csv.DictReader(file)
            lines = [line for line in file]