24_deb_pkg_gov/cleaning_did_data.py

import csv
import pandas as pd
import os


# Build final_readme_did.csv: copy every row of kk_031624_pr_did.csv whose
# project name (derived from column 0) matches the project prefix of a file
# found in the readme directory.
temp_dir = "/data/users/mgaughan/tmp3/"
readme_dir = "/data/users/mgaughan/kkex/time_specific_files/readme/readme"

# Parse the source CSV once and group its rows by project name, instead of
# re-reading the whole file for every entry in the readme directory.
# newline="" is what the csv module expects for files it reads or writes.
rows_by_project = {}
with open("kk_031624_pr_did.csv", "r", newline="") as file1:
    for line in csv.reader(file1):
        # Skip blank rows (csv yields []) and rows with an empty first column.
        if not line or line[0] == "":
            continue
        if "github" in line[0] or "gitlab" in line[0]:
            # making an evaluation that sub branches aren't being used and
            # that people would fork if needed; this only looks at main.
            # For "https://host/owner/repo/..." the fifth "/"-separated
            # segment is the repository name.
            project_name = line[0].split("/")[4]
        else:
            # NOTE(review): the temp_dir prefix contains "/" while names from
            # os.listdir never do, so these rows cannot match any file below.
            # Confirm whether the prefix is intended.
            project_name = temp_dir + line[0].split("/")[-1]
        rows_by_project.setdefault(project_name, []).append(line)

with open("final_readme_did.csv", "w", newline="") as writing_file:
    csv_writer = csv.writer(writing_file)
    for filename in os.listdir(readme_dir):
        # Drop the last "_"-separated piece of the file name.
        # NOTE(review): joining with "" also removes any underscores inside
        # the project name itself ("a_b_x" -> "ab"); confirm this is wanted.
        file_project = "".join(filename.split("_")[:-1])
        # Rows keep their original CSV order within each project.
        csv_writer.writerows(rows_by_project.get(file_project, []))
                        
# Disabled one-off cleanup, kept as a bare string literal so it never runs.
# It compares every pair of files in the readme directory by project prefix
# (file name minus its last "_"-separated piece) and removes the second file
# of any pair that shares a prefix.
# NOTE(review): the outer listing is taken once while the inner one is
# re-read on each pass, so it appears every file in a duplicated group would
# end up removed, not just the extras - confirm before re-enabling.
'''
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
    file_project = "".join(filename.split("_")[:-1])
    for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
        file_project2 = "".join(filename2.split("_")[:-1])
        if filename != filename2 and file_project == file_project2:
            os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2)
'''
backup, cleaning files 2024-03-31 21:38:56 +00:00			`import csv`
			`import pandas as pd`
			`import os`


			`temp_dir = "/data/users/mgaughan/tmp3/"`
			`with open("final_readme_did.csv", "w") as writing_file:`
			`csv_writer = csv.writer(writing_file)`
			`for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]:`
			`file_project = "".join(filename.split("_")[:-1])`
			`with open("kk_031624_pr_did.csv", "r") as file1:`
			`reader_obj = csv.reader(file1)`
			`for line in reader_obj:`
			`if line[0] == "":`
			`continue`
			`if "github" in line[0] or "gitlab" in line[0]:`
			`#making an evaluation that sub branches aren't being used and that people would fork if needed`
			`#this only looks at main`
			`temp_vcs = "/".join(line[0].split("/")[0:5])`
			`project_name = temp_vcs.split('/')[4]`
			`else:`
			`project_name = temp_dir + line[0].split('/')[- 1]`
			`if file_project == project_name:`
			`csv_writer.writerow(line)`

			`'''`
			`for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:`
			`file_project = "".join(filename.split("_")[:-1])`
			`for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:`
			`file_project2 = "".join(filename2.split("_")[:-1])`
			`if filename != filename2 and file_project == file_project2:`
			`os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2)`
			`'''`