From 8c22c87afd64b61c13e8f3327a46cf9c4746fe5a Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Sun, 12 May 2024 19:15:53 -0500 Subject: [PATCH] not sure why, paritioning fixed --- text_analysis/partitioning_files.py | 30 ++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/text_analysis/partitioning_files.py b/text_analysis/partitioning_files.py index 7303d6f..85f98f1 100644 --- a/text_analysis/partitioning_files.py +++ b/text_analysis/partitioning_files.py @@ -6,31 +6,38 @@ from tqdm import tqdm from pathlib import Path -working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/1/" -dest_dir = "/data/users/mgaughan/kkex/time_specific_files/readme3/test/" +working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/" +dest_dir = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme/p" def move_file(vcs_link, group): if "github" in vcs_link or "gitlab" in vcs_link: #making an evaluation that sub branches aren't being used and that people would fork if needed #this only looks at main vcs_link = "/".join(vcs_link.split("/")[0:5]) - full_temp_path = vcs_link.split('/')[4] + ".git" + full_temp_path = vcs_link.split('/')[4] #other_temp_path = vcs_link.split('/')[4] + ".git0" else: - full_temp_path = vcs_link.split('/')[- 1] + ".git" + full_temp_path = vcs_link.split('/')[- 1] #other_temp_path = vcs_link.split('/')[- 1] + ".git0" rel_filename = "" for filename in os.listdir(working_dir): spec, rel_filename = hard_codes(vcs_link, filename) project = "_".join(filename.split("_")[:-1]) #print(project) - if full_temp_path[:-4] == project: + if full_temp_path == project: + #print(full_temp_path, project) rel_filename = filename if rel_filename != "": target_path = working_dir + rel_filename #print(target_path) destination_path = dest_dir + str(group) + "/" + #print(target_path, full_temp_path[:-4], filename, destination_path) shutil.move(target_path, destination_path) + return filename + else: + print("error!") + print(full_temp_path) + def hard_codes(vcs_link, filename): if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump": @@ -69,11 +76,20 @@ def for_all_files(csv_path): with open(csv_path, 'r') as file: lines = [line for line in file] index = -1 + already_seen = [] for row in tqdm(csv.reader(lines), total=len(lines)): index += 1 if index == 0: continue - move_file(row[0], index % 2 + 1) + #print(row[3], row[-2]) + filename = move_file(row[3], row[-2]) + if filename not in already_seen: + already_seen.append(filename) + else: + print(filename) + break + print(len(already_seen)) if __name__ == "__main__": - for_all_files("final_data/deb_readme_did.csv") \ No newline at end of file + for_all_files("R/051224_readme_grouped.csv") + #for_all_files("R/051224_contrib_grouped.csv") \ No newline at end of file