"""Sort README files into per-group partition directories.

For every data row of a CSV file, the README belonging to the row's VCS link
is looked up in ``working_dir`` and moved into ``dest_dir + <group> + "/"``.
"""
import csv
import io
import os
import shutil
from pathlib import Path
from typing import Optional, Tuple

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, *_args, **_kwargs):
        """Return *iterable* unchanged (stand-in when tqdm is not installed)."""
        return iterable

working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/"
# NOTE: this is a path *prefix*; the group label is appended directly after
# the trailing "p" (e.g. ".../partitioned_readme1/p<group>/").
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme1/p"

# README files whose name does not follow the "<project>_<suffix>" pattern
# used by move_file(), keyed by the VCS link they belong to.
_HARD_CODED_READMES = {
    "https://github.com/df7cb/pg_filedump.git": "pg_filedump.git_README.pg_filedump",
    "https://github.com/sleuthkit/sleuthkit": "sleuthkit_README_win32.txt",
    "https://github.com/metlov/cycle.git": "cycle.git_README_ru.html",
    "https://github.com/winchen/engauge_debian": "engauge_debian_README_for_osx",
    "https://github.com/babelouest/yder": "yder_README_8md.html",
    "https://github.com/SebWouters/CheMPS2": "CheMPS2_README_8md_source.html",
    "https://github.com/TACC/Lmod": "Lmod_README_lua_modulefiles.txt",
    "https://github.com/hunspell/hyphen.git": "hyphen.git_README_hyph_en_US.txt",
    "https://github.com/greenbone/openvas": "openvas_UPGRADE_README",
    "https://github.com/MightyCreak/diffuse.git": "diffuse.git_README_ru",
}


def move_file(
    vcs_link: str,
    group,
    source_dir: Optional[str] = None,
    dest_prefix: Optional[str] = None,
) -> Optional[str]:
    """Move the README that belongs to *vcs_link* into its group directory.

    A file in the source directory matches when it is the hard-coded README
    for the link (see ``hard_codes``) or when everything before its last
    underscore equals the project name taken from the link. The first
    matching file is moved and no further files are inspected.

    Args:
        vcs_link: Repository URL. For links containing "github" or "gitlab"
            only the first five "/"-separated parts are kept.
        group: Partition label; ``str(group)`` is appended to *dest_prefix*.
        source_dir: Directory to search; defaults to ``working_dir``.
        dest_prefix: Destination path prefix; defaults to ``dest_dir``.

    Returns:
        The name of the moved file, or ``None`` when nothing matched (in that
        case "error!" and the project name are printed).

    Raises:
        IndexError: If a GitHub/GitLab link has fewer than five "/"-separated
            parts.
    """
    if source_dir is None:
        source_dir = working_dir
    if dest_prefix is None:
        dest_prefix = dest_dir

    if "github" in vcs_link or "gitlab" in vcs_link:
        # Assumes sub-branches are not used (people would fork if needed),
        # so only the main repository link is looked at.
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        project_name = vcs_link.split("/")[4]
    else:
        project_name = vcs_link.split("/")[-1]

    for filename in os.listdir(source_dir):
        is_hard_coded, _ = hard_codes(vcs_link, filename)
        # Files are expected to be named "<project>_<suffix>".
        file_project = "_".join(filename.split("_")[:-1])
        if not is_hard_coded and file_project != project_name:
            continue

        destination = f"{dest_prefix}{group}/"
        # Without an existing directory shutil.move() cannot place the file.
        os.makedirs(destination, exist_ok=True)
        shutil.move(os.path.join(source_dir, filename), destination)
        return filename

    print("error!")
    print(project_name)
    return None


def hard_codes(vcs_link: str, filename: str) -> Tuple[bool, str]:
    """Check whether *filename* is the hard-coded README for *vcs_link*.

    Returns:
        ``(True, filename)`` when the pair is listed in the hard-coded table,
        otherwise ``(False, "")``.
    """
    expected = _HARD_CODED_READMES.get(vcs_link)
    if expected is not None and expected == filename:
        return True, filename
    return False, ""


def for_all_files(csv_path: str) -> None:
    """Move the README of every data row in *csv_path* via ``move_file``.

    The first CSV row is treated as a header and skipped, as are blank rows.
    Column 3 is used as the VCS link and the second-to-last column as the
    group. Processing stops at the first result of ``move_file`` that was
    already seen; because an unmatched row yields ``None``, this includes the
    second unmatched row. The number of distinct results is printed at the end.
    """
    # newline="" lets the csv module handle line endings itself.
    with open(csv_path, "r", newline="") as file:
        lines = list(file)  # materialised so the progress bar knows the total

    already_seen = set()
    for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
        if index == 0 or not row:
            continue
        filename = move_file(row[3], row[-2])
        if filename in already_seen:
            print(filename)
            break
        already_seen.add(filename)
    print(len(already_seen))


if __name__ == "__main__":
    for_all_files("final_data/contrib_rdd_groupings.csv")
    # Alternative input: for_all_files("R/051224_contrib_grouped.csv")