not sure why, partitioning fixed

This commit is contained in:
Matthew Gaughan 2024-05-12 19:15:53 -05:00
parent 38e845ec90
commit 8c22c87afd

View File

@ -6,31 +6,38 @@ from tqdm import tqdm
from pathlib import Path from pathlib import Path
working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/1/" working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/"
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/readme3/test/" dest_dir = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme/p"
def move_file(vcs_link, group): def move_file(vcs_link, group):
if "github" in vcs_link or "gitlab" in vcs_link: if "github" in vcs_link or "gitlab" in vcs_link:
#making an evaluation that sub branches aren't being used and that people would fork if needed #making an evaluation that sub branches aren't being used and that people would fork if needed
#this only looks at main #this only looks at main
vcs_link = "/".join(vcs_link.split("/")[0:5]) vcs_link = "/".join(vcs_link.split("/")[0:5])
full_temp_path = vcs_link.split('/')[4] + ".git" full_temp_path = vcs_link.split('/')[4]
#other_temp_path = vcs_link.split('/')[4] + ".git0" #other_temp_path = vcs_link.split('/')[4] + ".git0"
else: else:
full_temp_path = vcs_link.split('/')[- 1] + ".git" full_temp_path = vcs_link.split('/')[- 1]
#other_temp_path = vcs_link.split('/')[- 1] + ".git0" #other_temp_path = vcs_link.split('/')[- 1] + ".git0"
rel_filename = "" rel_filename = ""
for filename in os.listdir(working_dir): for filename in os.listdir(working_dir):
spec, rel_filename = hard_codes(vcs_link, filename) spec, rel_filename = hard_codes(vcs_link, filename)
project = "_".join(filename.split("_")[:-1]) project = "_".join(filename.split("_")[:-1])
#print(project) #print(project)
if full_temp_path[:-4] == project: if full_temp_path == project:
#print(full_temp_path, project)
rel_filename = filename rel_filename = filename
if rel_filename != "": if rel_filename != "":
target_path = working_dir + rel_filename target_path = working_dir + rel_filename
#print(target_path) #print(target_path)
destination_path = dest_dir + str(group) + "/" destination_path = dest_dir + str(group) + "/"
#print(target_path, full_temp_path[:-4], filename, destination_path)
shutil.move(target_path, destination_path) shutil.move(target_path, destination_path)
return filename
else:
print("error!")
print(full_temp_path)
def hard_codes(vcs_link, filename): def hard_codes(vcs_link, filename):
if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump": if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump":
@ -69,11 +76,20 @@ def for_all_files(csv_path):
with open(csv_path, 'r') as file: with open(csv_path, 'r') as file:
lines = [line for line in file] lines = [line for line in file]
index = -1 index = -1
already_seen = []
for row in tqdm(csv.reader(lines), total=len(lines)): for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1 index += 1
if index == 0: if index == 0:
continue continue
move_file(row[0], index % 2 + 1) #print(row[3], row[-2])
filename = move_file(row[3], row[-2])
if filename not in already_seen:
already_seen.append(filename)
else:
print(filename)
break
print(len(already_seen))
if __name__ == "__main__": if __name__ == "__main__":
for_all_files("final_data/deb_readme_did.csv") for_all_files("R/051224_readme_grouped.csv")
#for_all_files("R/051224_contrib_grouped.csv")