24_deb_pkg_gov/text_analysis/partitioning_files.py

import csv
import io 
import shutil 
import os 
from tqdm import tqdm
from pathlib import Path


# Directory that move_file lists to find the file belonging to a repository.
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2/"
# Prefix of the per-group destination directories: move_file appends
# str(group) + "/" to it, so files land in ".../dwo_partitioned_contributing/p<group>/".
dest_dir = "/data/users/mgaughan/kkex/time_specific_files/dwo_partitioned_contributing/p"

def move_file(vcs_link, group):
    """Move the working-directory file that belongs to a repository into its group folder.

    The repository name is derived from ``vcs_link``. For links containing
    "github" or "gitlab" the link is first cut down to its first five
    "/"-separated pieces (e.g. ``https://github.com/owner/repo``) and the
    fifth piece is the name; for any other link the last piece is the name.

    A file in ``working_dir`` matches when either ``hard_codes`` recognises the
    (link, filename) pair, or everything before the filename's last underscore
    equals the repository name. The first match, in ``os.listdir`` order, is
    moved into ``dest_dir + str(group) + "/"``.

    Args:
        vcs_link: Repository URL as a string.
        group: Partition label; ``str(group)`` is appended to ``dest_dir``.

    Returns:
        The name of the file that was moved, or ``None`` when no file in
        ``working_dir`` matched (an error is printed once in that case).

    Raises:
        IndexError: If a GitHub/GitLab link has fewer than five "/"-separated
            pieces.
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # Sub-branches / sub-paths are assumed not to be in use (people would
        # fork instead), so only the repository root is considered.
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        repo_name = vcs_link.split("/")[4]
    else:
        repo_name = vcs_link.split("/")[-1]

    for filename in os.listdir(working_dir):
        is_hard_coded, _ = hard_codes(vcs_link, filename)
        # Everything before the last underscore is treated as the project name.
        project = "_".join(filename.split("_")[:-1])
        if not is_hard_coded and repo_name != project:
            continue
        destination_path = dest_dir + str(group) + "/"
        shutil.move(os.path.join(working_dir, filename), destination_path)
        return filename

    # Reached only after the whole directory was scanned without a match, so
    # the failure is reported once rather than once per non-matching file.
    print("error!")
    print(repo_name)
    return None
        

def hard_codes(vcs_link, filename):
    """Check a (repository link, filename) pair against the hand-maintained exceptions.

    The listed filenames carry extra underscores after the project name, so
    the "prefix before the last underscore" rule used by move_file would not
    recover the repository name for them.

    Returns:
        ``(True, filename)`` when the exact pair is listed below,
        otherwise ``(False, "")``.
    """
    known_pairs = (
        ("https://github.com/df7cb/pg_filedump.git", "pg_filedump.git_README.pg_filedump"),
        ("https://github.com/sleuthkit/sleuthkit", "sleuthkit_README_win32.txt"),
        ("https://github.com/metlov/cycle.git", "cycle.git_README_ru.html"),
        ("https://github.com/winchen/engauge_debian", "engauge_debian_README_for_osx"),
        ("https://github.com/babelouest/yder", "yder_README_8md.html"),
        ("https://github.com/SebWouters/CheMPS2", "CheMPS2_README_8md_source.html"),
        ("https://github.com/TACC/Lmod", "Lmod_README_lua_modulefiles.txt"),
        ("https://github.com/hunspell/hyphen.git", "hyphen.git_README_hyph_en_US.txt"),
        ("https://github.com/greenbone/openvas", "openvas_UPGRADE_README"),
        ("https://github.com/MightyCreak/diffuse.git", "diffuse.git_README_ru"),
    )
    for known_link, known_file in known_pairs:
        # Both halves must match exactly; a link paired with another
        # repository's file does not count.
        if vcs_link == known_link and filename == known_file:
            return True, filename
    return False, ""

def for_all_files(csv_path):
    """Run move_file for every data row of the CSV at ``csv_path``.

    The first row is skipped as a header. For each remaining row, column
    index 3 and the second-to-last column are handed to move_file (as its
    ``vcs_link`` and ``group`` arguments). Each return value is remembered;
    as soon as move_file returns a value that was already returned for an
    earlier row, that value is printed and processing stops. The number of
    distinct return values collected is printed at the end.
    """
    with open(csv_path, 'r') as handle:
        raw_lines = handle.readlines()
        returned_so_far = []
        progress = tqdm(csv.reader(raw_lines), total=len(raw_lines))
        for position, record in enumerate(progress):
            if position == 0:
                # header row
                continue
            outcome = move_file(record[3], record[-2])
            if outcome in returned_so_far:
                # A repeated return value halts the whole run.
                print(outcome)
                break
            returned_so_far.append(outcome)
        print(len(returned_so_far))

# Run the partitioning only when this file is executed as a script.
if __name__ == "__main__":
    # The CSV path is relative, so it is resolved against the current working directory.
    for_all_files("final_data/deb_contrib_interaction_groupings.csv")
    # Alternative input, currently disabled:
    #for_all_files("R/051224_contrib_grouped.csv")
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`import csv`
			`import io`
			`import shutil`
			`import os`
			`from tqdm import tqdm`
			`from pathlib import Path`


updating partitioned and readability metrics for contributing 2024-07-15 18:24:45 +00:00			`working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2/"`
			`dest_dir = "/data/users/mgaughan/kkex/time_specific_files/dwo_partitioned_contributing/p"`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00
			`def move_file(vcs_link, group):`
			`if "github" in vcs_link or "gitlab" in vcs_link:`
			`#making an evaluation that sub branches aren't being used and that people would fork if needed`
			`#this only looks at main`
			`vcs_link = "/".join(vcs_link.split("/")[0:5])`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`full_temp_path = vcs_link.split('/')[4]`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`#other_temp_path = vcs_link.split('/')[4] + ".git0"`
			`else:`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`full_temp_path = vcs_link.split('/')[- 1]`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`#other_temp_path = vcs_link.split('/')[- 1] + ".git0"`
			`rel_filename = ""`
			`for filename in os.listdir(working_dir):`
			`spec, rel_filename = hard_codes(vcs_link, filename)`
			`project = "_".join(filename.split("_")[:-1])`
			`#print(project)`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`if full_temp_path == project:`
			`#print(full_temp_path, project)`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`rel_filename = filename`
			`if rel_filename != "":`
			`target_path = working_dir + rel_filename`
			`#print(target_path)`
			`destination_path = dest_dir + str(group) + "/"`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`#print(target_path, full_temp_path[:-4], filename, destination_path)`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`shutil.move(target_path, destination_path)`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`return filename`
			`else:`
			`print("error!")`
			`print(full_temp_path)`

preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00
			`def hard_codes(vcs_link, filename):`
			`if vcs_link == "https://github.com/df7cb/pg_filedump.git" and filename == "pg_filedump.git_README.pg_filedump":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/sleuthkit/sleuthkit" and filename == "sleuthkit_README_win32.txt":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/metlov/cycle.git" and filename == "cycle.git_README_ru.html":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/winchen/engauge_debian" and filename == "engauge_debian_README_for_osx":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/babelouest/yder" and filename == "yder_README_8md.html":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/SebWouters/CheMPS2" and filename == "CheMPS2_README_8md_source.html":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/TACC/Lmod" and filename == "Lmod_README_lua_modulefiles.txt":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/hunspell/hyphen.git" and filename == "hyphen.git_README_hyph_en_US.txt":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/greenbone/openvas" and filename == "openvas_UPGRADE_README":`
			`rel_filename = filename`
			`return True, rel_filename`
			`if vcs_link == "https://github.com/MightyCreak/diffuse.git" and filename == "diffuse.git_README_ru":`
			`rel_filename = filename`
			`return True, rel_filename`
			`return False, ""`

			`def for_all_files(csv_path):`
			`with open(csv_path, 'r') as file:`
			`lines = [line for line in file]`
			`index = -1`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`already_seen = []`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00			`for row in tqdm(csv.reader(lines), total=len(lines)):`
			`index += 1`
			`if index == 0:`
			`continue`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`#print(row[3], row[-2])`
			`filename = move_file(row[3], row[-2])`
			`if filename not in already_seen:`
			`already_seen.append(filename)`
			`else:`
			`print(filename)`
			`break`
			`print(len(already_seen))`
preparing for grouped topic analysis 2024-05-08 17:09:00 +00:00
			`if __name__ == "__main__":`
updating partitioned and readability metrics for contributing 2024-07-15 18:24:45 +00:00			`for_all_files("final_data/deb_contrib_interaction_groupings.csv")`
not sure why, paritioning fixed 2024-05-13 00:15:53 +00:00			`#for_all_files("R/051224_contrib_grouped.csv")`