# 24_deb_pkg_gov/cleaning_scripts/cleaning_did_data.py

import csv
import pandas as pd
import os

# The directory of collected files is the record of which projects we have the specific files for.
# Each file name in that directory is matched back to its project's row in the DiD data.

# Retained for backward compatibility; it is no longer used when matching.
# Prefixing project names with this path made every non-GitHub/GitLab row
# unmatchable, because file names never contain a directory component.
temp_dir = "/data/users/mgaughan/tmp3/"

# Default locations used when the script is run directly.
CONTRIBUTING_FILES_DIR = "/data/users/mgaughan/kkex/time_specific_files/contributing/"
DID_DATA_CSV = "final_data/deb_contrib_did_data.csv"
OUTPUT_CSV = "final_contributing_did.csv"


def _project_from_filename(filename):
    """Return the project part of a ``<project>_<suffix>`` file name.

    Everything before the last underscore is kept verbatim, so project names
    that themselves contain underscores stay intact. Returns "" when the file
    name has no underscore at all.
    """
    return filename.rpartition("_")[0]


def _project_from_vcs_link(vcs_link):
    """Return the project name encoded in a VCS link, or None if it has none.

    For GitHub/GitLab links the name is the fifth "/"-separated component
    (``scheme://host/owner/project``).
    # making an evaluation that sub branches aren't being used and that
    # people would fork if needed; this only looks at main
    For any other link the last "/"-separated component is used.
    """
    parts = vcs_link.split("/")
    if "github" in vcs_link or "gitlab" in vcs_link:
        # Links too short to carry a project component are skipped instead of
        # raising IndexError.
        return parts[4] if len(parts) >= 5 else None
    return parts[-1]


def _index_rows_by_project(rows):
    """Map each project name to the first row whose link (column 0) names it."""
    index = {}
    for row in rows:
        # Skip blank CSV lines and rows with an empty link column.
        if not row or row[0] == "":
            continue
        project_name = _project_from_vcs_link(row[0])
        if project_name:
            # setdefault keeps the first matching row, mirroring the original
            # "write the first match, then stop scanning" behaviour.
            index.setdefault(project_name, row)
    return index


def main(files_dir=CONTRIBUTING_FILES_DIR, did_csv=DID_DATA_CSV, out_csv=OUTPUT_CSV):
    """Write the DiD rows of every project that has a file in ``files_dir``.

    Args:
        files_dir: directory whose file names look like ``<project>_<suffix>``.
        did_csv: input CSV whose first column is the project's VCS link.
        out_csv: output CSV; receives one row per file whose project is found
            in ``did_csv`` (no header row is written).

    The DiD CSV is read once and indexed by project name instead of being
    re-opened and re-scanned for every file.
    """
    with open(did_csv, "r", newline="") as did_file:
        rows_by_project = _index_rows_by_project(csv.reader(did_file))

    # newline="" is what the csv module expects for files it writes to.
    with open(out_csv, "w", newline="") as writing_file:
        csv_writer = csv.writer(writing_file)
        for filename in os.listdir(files_dir):
            row = rows_by_project.get(_project_from_filename(filename))
            if row is not None:
                csv_writer.writerow(row)


if __name__ == "__main__":
    main()

# Disabled snippet kept as a bare string literal (never executed).
# It walks the readme directory and, for each file, removes any *other* file
# whose name yields the same project prefix (the text before the last "_",
# with underscores stripped).
# NOTE(review): the outer listing is materialised once while the inner listing
# is refreshed each pass, so for a duplicated project every copy appears to
# end up removed rather than all-but-one — confirm before re-enabling.
'''
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
    file_project = "".join(filename.split("_")[:-1])
    for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
        file_project2 = "".join(filename2.split("_")[:-1])
        if filename != filename2 and file_project == file_project2:
            os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2)
'''