24_deb_pkg_gov/cleaning_scripts/cleaning_contrib_files.py
2024-04-02 18:16:50 -05:00

67 lines
2.6 KiB
Python

import csv
import os
import json
import pandas as pd
def csv_count(in_path="011824_uni_contrib.csv", out_path="cleaned_0118_uni_constrib.csv"):
    """Copy rows that have at least one non-zero contribution count.

    Reads ``in_path`` as CSV and writes ``out_path`` with a fixed header
    followed by every input row whose four count fields (columns 2-5) are
    not all the string ``'0'``.

    Args:
        in_path: CSV to filter. Defaults to the original hard-coded name.
        out_path: CSV to (over)write. Defaults to the original hard-coded
            name.
            NOTE(review): this default is spelled ``constrib`` while
            ``consolidate_csv`` reads ``cleaned_0118_uni_contrib.csv`` --
            confirm which spelling is intended.

    Returns:
        The number of data rows written (also printed).
    """
    keys = [
        "project_name",
        "project_owner",
        "api_contrib_count",
        "issue_contrib_count",
        "file_contrib_count",
        "wiki_contrib_count",
    ]
    true_rep_counter = 0
    # Open the input first so a missing input does not truncate the output.
    # newline="" is what the csv module expects for both readers and writers.
    with open(in_path, "r", newline="") as file, \
            open(out_path, "w", newline="") as writefile:
        reader = csv.reader(file)
        writer = csv.writer(writefile)
        writer.writerow(keys)
        for line in reader:
            # Blank or truncated rows cannot be classified; skip them instead
            # of failing with IndexError.
            if len(line) < len(keys):
                continue
            # The header is already written above; do not copy it again if
            # the input file carries the same header row.
            if line[:len(keys)] == keys:
                continue
            if line[2] == line[3] == line[4] == line[5] == '0':
                print("zeroes")
            else:
                writer.writerow(line)
                print(line)
                true_rep_counter += 1
    print(true_rep_counter)
    return true_rep_counter
def checking_cross(
    cleaned_path="cleaned_0118_uni_constrib.csv",
    inst_dir="/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/",
):
    """Count instruction files on disk that match rows of the cleaned CSV.

    For every row, the first field is combined with the suffixes
    ``_inst.md`` and ``.git_inst.md``; each resulting path that exists
    under ``inst_dir`` adds one to the total.

    Args:
        cleaned_path: CSV whose first column is used as the file stem.
            Defaults to the original hard-coded name.
        inst_dir: Directory that is probed for the files. Defaults to the
            original hard-coded location.

    Returns:
        The number of existing files found (also printed).
    """
    suffixes = ("_inst.md", ".git_inst.md")
    checking_sum = 0
    with open(cleaned_path, "r", newline="") as readfile:
        for i, line in enumerate(csv.reader(readfile)):
            # Blank lines come back as [] and have no first field.
            if not line:
                continue
            # csv_count writes a header whose first cell is "project_name";
            # that row is not a project, so do not probe the disk for it.
            if i == 0 and line[0] == "project_name":
                continue
            for suffix in suffixes:
                if os.path.exists(os.path.join(inst_dir, line[0] + suffix)):
                    checking_sum += 1
    print(checking_sum)
    return checking_sum
def consolidate_csv(
    contrib_path="cleaned_0118_uni_contrib.csv",
    underprod_path="expanded_data_final.csv",
    output_path="octo_data_total.csv",
):
    """Join contribution counts onto the rows of the larger data set.

    Each row of ``contrib_path`` is turned into ``"<owner>/<name>"``; the
    last ``upstream_vcs_link`` in ``underprod_path`` that contains that text
    is treated as its match. For every match, the first data row carrying
    that link is written to ``output_path`` with the four count columns
    appended. Rows without a match are not written.

    Args:
        contrib_path: CSV with ``project_name``, ``project_owner`` and the
            four ``*_contrib_count`` columns.
            NOTE(review): this default is spelled ``contrib`` while
            ``csv_count`` writes ``cleaned_0118_uni_constrib.csv`` --
            confirm which spelling is intended.
        underprod_path: CSV that has an ``upstream_vcs_link`` column.
        output_path: CSV to (over)write.

    Returns:
        The number of joined rows written (also printed).
    """
    count_columns = [
        "api_contrib_count",
        "issue_contrib_count",
        "file_contrib_count",
        "wiki_contrib_count",
    ]
    contributor_count_csv = pd.read_csv(contrib_path)
    print(contributor_count_csv.head())
    total_underprod_csv = pd.read_csv(underprod_path)
    print(total_underprod_csv.head())
    columns = list(total_underprod_csv.columns) + count_columns

    # Build the link -> first-row lookup once instead of re-scanning the
    # frame with .loc for every match. Missing links are read as NaN
    # (a float) and cannot be substring-matched, so they are left out.
    list_of_links = []
    first_row_by_link = {}
    all_links = total_underprod_csv["upstream_vcs_link"].tolist()
    all_rows = total_underprod_csv.values.tolist()
    for link, values in zip(all_links, all_rows):
        if isinstance(link, str):
            list_of_links.append(link)
            first_row_by_link.setdefault(link, values)

    count = 0
    with open(output_path, 'w', newline='') as output_file:
        # csv.writer's second positional argument is the dialect, not the
        # field names; the header is written explicitly below.
        writer = csv.writer(output_file)
        writer.writerow(columns)
        for _, row in contributor_count_csv.iterrows():
            owner = row['project_owner']
            name = row['project_name']
            # A missing owner or name cannot identify a repository.
            if pd.isna(owner) or pd.isna(name):
                continue
            string_value = f"{owner}/{name}"
            matched_link = None
            for item in list_of_links:
                if string_value in item:
                    # Keep scanning: the last matching link wins.
                    matched_link = item
            row_value = [] if matched_link is None else list(first_row_by_link[matched_link])
            row_value.extend(row[column] for column in count_columns)
            print(row_value)
            if matched_link is not None:
                writer.writerow(row_value)
                count += 1
    print(count)
    return count
# Script entry point: running this file directly performs only the
# consolidation step; the other helpers must be called explicitly.
if __name__ == "__main__":
    consolidate_csv()