This commit is contained in:
Matthew Gaughan 2024-03-31 17:02:21 -05:00
parent d113e0991a
commit cc590c6ecc
2 changed files with 29 additions and 36 deletions

View File

@ -2,11 +2,13 @@ import csv
import pandas as pd import pandas as pd
import os import os
# The directory of downloaded files is the record of which projects we have the specific files for.
# Use the filenames in that directory to match each project back to its row in the DiD data.
temp_dir = "/data/users/mgaughan/tmp3/" temp_dir = "/data/users/mgaughan/tmp3/"
with open("final_readme_did.csv", "w") as writing_file: with open("final_readme_did.csv", "w") as writing_file:
csv_writer = csv.writer(writing_file) csv_writer = csv.writer(writing_file)
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]: for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/")]:
file_project = "".join(filename.split("_")[:-1]) file_project = "".join(filename.split("_")[:-1])
with open("kk_031624_pr_did.csv", "r") as file1: with open("kk_031624_pr_did.csv", "r") as file1:
reader_obj = csv.reader(file1) reader_obj = csv.reader(file1)
@ -22,6 +24,7 @@ with open("final_readme_did.csv", "w") as writing_file:
project_name = temp_dir + line[0].split('/')[- 1] project_name = temp_dir + line[0].split('/')[- 1]
if file_project == project_name: if file_project == project_name:
csv_writer.writerow(line) csv_writer.writerow(line)
break
''' '''
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]: for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:

View File

@ -12,7 +12,7 @@ import math
import io import io
import re import re
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing" working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2"
temp_dir = "/data/users/mgaughan/tmp3/" temp_dir = "/data/users/mgaughan/tmp3/"
# getting the specific readme or contributing file from a given commit # getting the specific readme or contributing file from a given commit
@ -48,20 +48,7 @@ def get_file(vcs_link, commit_hash, is_readme):
target_filename = file['file'].split("/")[-1] target_filename = file['file'].split("/")[-1]
else: else:
target_filename = file['file'] target_filename = file['file']
#print(commit.tree)
#getting the name of the file from the root directory
'''
target_filename = ""
for filename in os.listdir(full_temp_path):
if is_readme:
#target_filename = "README.md"
if "README" in filename or "readme" in filename:
target_filename = filename
else:
#target_filename = "CONTRIBUTING.md"
if "CONTRIBUTING" in filename or "contributing" in filename:
target_filename = filename
'''
if target_filename == "": if target_filename == "":
shutil.rmtree(full_temp_path, ignore_errors=True) shutil.rmtree(full_temp_path, ignore_errors=True)
shutil.rmtree(other_temp_path, ignore_errors=True) shutil.rmtree(other_temp_path, ignore_errors=True)
@ -80,9 +67,8 @@ def get_file(vcs_link, commit_hash, is_readme):
shutil.rmtree(full_temp_path, ignore_errors=True) shutil.rmtree(full_temp_path, ignore_errors=True)
shutil.rmtree(other_temp_path, ignore_errors=True) shutil.rmtree(other_temp_path, ignore_errors=True)
return "KeyError -- the file is not in the commit tree" return "KeyError -- the file is not in the commit tree"
if is_readme: if is_readme:
last_path = "readme" last_path = "readme2"
else: else:
last_path = "contributing" last_path = "contributing"
with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file: with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file:
@ -96,12 +82,14 @@ def get_file(vcs_link, commit_hash, is_readme):
def for_all_files(): def for_all_files():
#toggle this based on readme or contributing files #toggle this based on readme or contributing files
readme_is = False readme_is = True
csv_path = "final_data/deb_contrib_did_data.csv" csv_path = "kk_031624_pr_did.csv"
index = -1 index = -1
with open(csv_path, 'r') as file: with open(csv_path, 'r') as file:
with open('c_031824_spec_errors.csv', "w") as writing_file: with open('d_031824_spec_errors.csv', "w") as writing_file:
csv_writer = csv.writer(writing_file) csv_writer = csv.writer(writing_file)
with open("readme_completed_downloads.csv", "w") as writing_file2:
csv_writer2 = csv.writer(writing_file2)
#csv_reader = csv.DictReader(file) #csv_reader = csv.DictReader(file)
lines = [line for line in file] lines = [line for line in file]
for row in tqdm(csv.reader(lines), total=len(lines)): for row in tqdm(csv.reader(lines), total=len(lines)):
@ -114,6 +102,8 @@ def for_all_files():
return_value = get_file(row[0], row[2], readme_is) return_value = get_file(row[0], row[2], readme_is)
if return_value != "NoError": if return_value != "NoError":
csv_writer.writerow([row[0], row[2], readme_is, return_value]) csv_writer.writerow([row[0], row[2], readme_is, return_value])
else:
csv_writer2.writerow(row)
# If the result is "NoError", write the row to a different CSV instead of the error log.
# Duplicate rows are a known issue, but processing continues regardless.
# If there are no duplicates, the row is simply run through.