not sure why, partitioning fixed
This commit is contained in:
parent
38e845ec90
commit
8c22c87afd
@ -6,31 +6,38 @@ from tqdm import tqdm
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/1/"
|
working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2/"
|
||||||
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/readme3/test/"
|
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme/p"
|
||||||
|
|
||||||
def move_file(vcs_link, group):
|
def move_file(vcs_link, group):
|
||||||
if "github" in vcs_link or "gitlab" in vcs_link:
|
if "github" in vcs_link or "gitlab" in vcs_link:
|
||||||
#making an evaluation that sub branches aren't being used and that people would fork if needed
|
#making an evaluation that sub branches aren't being used and that people would fork if needed
|
||||||
#this only looks at main
|
#this only looks at main
|
||||||
vcs_link = "/".join(vcs_link.split("/")[0:5])
|
vcs_link = "/".join(vcs_link.split("/")[0:5])
|
||||||
full_temp_path = vcs_link.split('/')[4] + ".git"
|
full_temp_path = vcs_link.split('/')[4]
|
||||||
#other_temp_path = vcs_link.split('/')[4] + ".git0"
|
#other_temp_path = vcs_link.split('/')[4] + ".git0"
|
||||||
else:
|
else:
|
||||||
full_temp_path = vcs_link.split('/')[- 1] + ".git"
|
full_temp_path = vcs_link.split('/')[- 1]
|
||||||
#other_temp_path = vcs_link.split('/')[- 1] + ".git0"
|
#other_temp_path = vcs_link.split('/')[- 1] + ".git0"
|
||||||
rel_filename = ""
|
rel_filename = ""
|
||||||
for filename in os.listdir(working_dir):
|
for filename in os.listdir(working_dir):
|
||||||
spec, rel_filename = hard_codes(vcs_link, filename)
|
spec, rel_filename = hard_codes(vcs_link, filename)
|
||||||
project = "_".join(filename.split("_")[:-1])
|
project = "_".join(filename.split("_")[:-1])
|
||||||
#print(project)
|
#print(project)
|
||||||
if full_temp_path[:-4] == project:
|
if full_temp_path == project:
|
||||||
|
#print(full_temp_path, project)
|
||||||
rel_filename = filename
|
rel_filename = filename
|
||||||
if rel_filename != "":
|
if rel_filename != "":
|
||||||
target_path = working_dir + rel_filename
|
target_path = working_dir + rel_filename
|
||||||
#print(target_path)
|
#print(target_path)
|
||||||
destination_path = dest_dir + str(group) + "/"
|
destination_path = dest_dir + str(group) + "/"
|
||||||
|
#print(target_path, full_temp_path[:-4], filename, destination_path)
|
||||||
shutil.move(target_path, destination_path)
|
shutil.move(target_path, destination_path)
|
||||||
|
return filename
|
||||||
|
else:
|
||||||
|
print("error!")
|
||||||
|
print(full_temp_path)
|
||||||
|
|
||||||
|
|
||||||
def hard_codes(vcs_link, filename):
|
def hard_codes(vcs_link, filename):
|
||||||
if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump":
|
if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump":
|
||||||
@ -69,11 +76,20 @@ def for_all_files(csv_path):
|
|||||||
with open(csv_path, 'r') as file:
|
with open(csv_path, 'r') as file:
|
||||||
lines = [line for line in file]
|
lines = [line for line in file]
|
||||||
index = -1
|
index = -1
|
||||||
|
already_seen = []
|
||||||
for row in tqdm(csv.reader(lines), total=len(lines)):
|
for row in tqdm(csv.reader(lines), total=len(lines)):
|
||||||
index += 1
|
index += 1
|
||||||
if index == 0:
|
if index == 0:
|
||||||
continue
|
continue
|
||||||
move_file(row[0], index % 2 + 1)
|
#print(row[3], row[-2])
|
||||||
|
filename = move_file(row[3], row[-2])
|
||||||
|
if filename not in already_seen:
|
||||||
|
already_seen.append(filename)
|
||||||
|
else:
|
||||||
|
print(filename)
|
||||||
|
break
|
||||||
|
print(len(already_seen))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
for_all_files("final_data/deb_readme_did.csv")
|
for_all_files("R/051224_readme_grouped.csv")
|
||||||
|
#for_all_files("R/051224_contrib_grouped.csv")
|
Loading…
Reference in New Issue
Block a user