# 24_deb_pkg_gov/cleaning_scripts/cleaning_did_data.py
# 2024-04-02 18:16:50 -05:00
#
# 38 lines
# 1.9 KiB
# Python

import csv
import pandas as pd
import os
# The directory of time-specific files is the record of which projects we
# actually hold files for. Each listed file is matched back to a row of the
# DiD data by project name, and the matching rows are written to a new CSV.
CONTRIBUTING_FILES_DIR = "/data/users/mgaughan/kkex/time_specific_files/contributing/"
DID_DATA_PATH = "final_data/deb_contrib_did_data.csv"
OUTPUT_PATH = "final_contributing_did.csv"


def project_from_filename(filename):
    """Return the project prefix of a ``<project>_<suffix>`` file name.

    Everything before the last underscore is the project. Underscores inside
    the project name are preserved (joining the pieces with "" used to turn
    ``my_repo`` into ``myrepo``, which could never match the DiD data).
    Returns "" when the name contains no underscore at all.
    """
    return "_".join(filename.split("_")[:-1])


def project_from_vcs_link(vcs_link):
    """Return the project name encoded in a VCS link, or "" if there is none.

    For links containing "github" or "gitlab" the name is the fifth
    "/"-separated segment (``scheme://host/owner/<project>/...``).
    This assumes sub-branches are not used and that people would fork if
    needed, so only the main repository is considered. Links with fewer than
    five segments yield "" instead of raising IndexError.

    For any other link the name is the last "/"-separated segment. It is
    returned bare: prefixing it with a local temp directory made it
    impossible for these rows to ever equal a file-name prefix.
    """
    segments = vcs_link.split("/")
    if "github" in vcs_link or "gitlab" in vcs_link:
        return segments[4] if len(segments) > 4 else ""
    return segments[-1]


def index_rows_by_project(rows):
    """Map each project name to the first row whose first cell names it.

    Rows that are empty, have an empty first cell, or yield no project name
    are skipped. Keeping only the first row per project reproduces the
    "stop at the first match" behaviour of scanning the data top to bottom.
    """
    index = {}
    for row in rows:
        if not row or row[0] == "":
            continue
        project = project_from_vcs_link(row[0])
        if project:
            index.setdefault(project, row)
    return index


def select_rows(filenames, index):
    """Return, in ``filenames`` order, the indexed row for each matched file.

    Files whose project prefix is not in ``index`` contribute nothing. A
    project with several files contributes its row once per file.
    """
    selected = []
    for filename in filenames:
        row = index.get(project_from_filename(filename))
        if row is not None:
            selected.append(row)
    return selected


def main(files_dir=CONTRIBUTING_FILES_DIR, did_path=DID_DATA_PATH,
         output_path=OUTPUT_PATH):
    """Write the DiD rows of every project that has a file in ``files_dir``.

    The DiD data is read once and indexed, rather than re-read for every
    listed file. Both CSV files are opened with ``newline=""`` as the csv
    module requires, so quoted fields with embedded newlines survive intact.
    """
    with open(did_path, "r", newline="") as did_file:
        index = index_rows_by_project(csv.reader(did_file))
    matched_rows = select_rows(os.listdir(files_dir), index)
    with open(output_path, "w", newline="") as writing_file:
        csv.writer(writing_file).writerows(matched_rows)


if __name__ == "__main__":
    main()
# Disabled one-off cleanup, kept for reference only: the code below sits inside
# a bare string literal, so it is never executed. As written it walks the
# readme directory and, for every file, deletes any *other* file that shares
# the same project prefix (the file name up to its last "_", underscores
# dropped).
# NOTE(review): the outer listing appears to be taken once while the inner
# listing is refreshed per file, so a file deleted earlier would still be
# visited and could delete its surviving twin — confirm before reviving this.
'''
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
file_project = "".join(filename.split("_")[:-1])
for filename2 in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
file_project2 = "".join(filename2.split("_")[:-1])
if filename != filename2 and file_project == file_project2:
os.remove("/data/users/mgaughan/kkex/time_specific_files/readme/readme/" + filename2)
'''