# 24_deb_pkg_gov/text_analysis/partitioning_files.py
#
# Web-view metadata captured with this file: 95 lines, 3.9 KiB, Python.
# Blame timestamp for the lines below: 2024-05-08 17:09:00 +00:00
import csv
import io
import shutil
import os
from tqdm import tqdm
from pathlib import Path
# Directory that move_file lists (os.listdir) to find the file to relocate.
# Kept as a plain string ending in "/" because file names are appended to it.
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2/"
# Prefix of the destination directories, not a directory itself: move_file
# appends the group label, so files land in ".../p<group>/".
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/dwo_partitioned_contributing/p"
# blame: 2024-05-08 17:09:00 +00:00
def move_file(vcs_link, group):
    """Move the file that belongs to a repository into its group's partition.

    The repository name is derived from *vcs_link* and compared with the
    project prefix of every file in ``working_dir`` (the file name with its
    last ``_``-separated piece removed).  File names whose prefix cannot be
    derived that way are recognised through ``hard_codes``.  The first
    matching file is moved into the directory ``dest_dir + str(group)``.

    Args:
        vcs_link: URL of the project's version-control repository.
        group: Partition label; it is appended to ``dest_dir`` to form the
            destination directory.

    Returns:
        The name of the moved file, or ``None`` when no repository name can
        be derived or no file in ``working_dir`` matches.  In both failure
        cases ``"error!"`` and the name that was searched for are printed.
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # Sub-branches are assumed not to be used (people would fork if
        # needed), so only the scheme/host/owner/repository part of the link
        # is kept and only the main repository is considered.
        link_parts = vcs_link.split("/")[:5]
        vcs_link = "/".join(link_parts)
        # A link shorter than five components has no repository segment.
        repo_name = link_parts[4] if len(link_parts) == 5 else ""
    else:
        # Strip a trailing slash first; otherwise the last component would be
        # the empty string.
        repo_name = vcs_link.rstrip("/").split("/")[-1]

    # An empty repository name would "match" every file without an
    # underscore, so it is treated as a failure instead of being searched.
    if repo_name:
        for filename in os.listdir(working_dir):
            is_hard_coded, _ = hard_codes(vcs_link, filename)
            # Everything before the last underscore names the project.
            project = filename.rpartition("_")[0]
            if is_hard_coded or project == repo_name:
                # dest_dir is a prefix ("…/p"), so the label is concatenated
                # rather than joined as a path component.
                destination_dir = dest_dir + str(group)
                os.makedirs(destination_dir, exist_ok=True)
                shutil.move(os.path.join(working_dir, filename), destination_dir)
                return filename

    print("error!")
    print(repo_name or vcs_link)
    return None
# blame: 2024-05-08 17:09:00 +00:00
# (repository link, file name) pairs whose project prefix cannot be recovered
# by cutting the file name at its last underscore.
_HARD_CODED_FILES = (
    ("https://github.com/df7cb/pg_filedump.git", "pg_filedump.git_README.pg_filedump"),
    ("https://github.com/sleuthkit/sleuthkit", "sleuthkit_README_win32.txt"),
    ("https://github.com/metlov/cycle.git", "cycle.git_README_ru.html"),
    ("https://github.com/winchen/engauge_debian", "engauge_debian_README_for_osx"),
    ("https://github.com/babelouest/yder", "yder_README_8md.html"),
    ("https://github.com/SebWouters/CheMPS2", "CheMPS2_README_8md_source.html"),
    ("https://github.com/TACC/Lmod", "Lmod_README_lua_modulefiles.txt"),
    ("https://github.com/hunspell/hyphen.git", "hyphen.git_README_hyph_en_US.txt"),
    ("https://github.com/greenbone/openvas", "openvas_UPGRADE_README"),
    ("https://github.com/MightyCreak/diffuse.git", "diffuse.git_README_ru"),
)


def hard_codes(vcs_link, filename):
    """Check whether *filename* is a known special-case file of *vcs_link*.

    Args:
        vcs_link: Repository URL, compared verbatim with the known links.
        filename: Candidate file name, compared verbatim with the known names.

    Returns:
        ``(True, filename)`` when the pair is one of the hard-coded
        exceptions, otherwise ``(False, "")``.
    """
    for known_link, known_name in _HARD_CODED_FILES:
        if vcs_link == known_link and filename == known_name:
            return True, filename
    return False, ""
def for_all_files(csv_path):
    """Move every file listed in a grouping CSV into its partition.

    The first row of the CSV is treated as a header.  For each remaining row
    the repository link is read from the fourth column (``row[3]``) and the
    group label from the second-to-last column (``row[-2]``); both are handed
    to ``move_file``.  If ``move_file`` reports the same file name twice, that
    name is printed and processing stops.  The number of distinct files moved
    is printed at the end.

    Args:
        csv_path: Path to the CSV file describing the groupings.
    """
    with open(csv_path, 'r') as file:
        lines = file.readlines()

    reader = csv.reader(lines)
    next(reader, None)  # skip the header row

    # A set gives O(1) duplicate checks; only membership and size are needed.
    moved = set()
    for row in tqdm(reader, total=max(len(lines) - 1, 0)):
        if not row:
            # Blank line in the CSV: nothing to move.
            continue
        filename = move_file(row[3], row[-2])
        if filename is None:
            # move_file already printed the failure.  Recording None would
            # make the next failure look like a duplicate and abort the run.
            continue
        if filename in moved:
            print(filename)
            break
        moved.add(filename)
    print(len(moved))
# blame: 2024-05-08 17:09:00 +00:00
def _main():
    """Partition the files according to the default grouping CSV."""
    # Previously used grouping file: "R/051224_contrib_grouped.csv"
    for_all_files("final_data/deb_contrib_interaction_groupings.csv")


if __name__ == "__main__":
    _main()