"""Survey upstream repositories for docs and compute contributor denominators."""

import csv
import json
import os
import shutil

import pandas as pd
from git import Repo
from tqdm import tqdm

# Scratch directory that receives the temporary clones.
temp_dir = "/data/users/mgaughan/tmp3/"

# Directory holding the per-project ``contrib_roster_<project>.json`` files.
ROSTER_DIR = '/data/users/mgaughan/kkex/contrib_uni_rosters_013124/'

# Roster sections merged into the contributor denominator, in merge order.
ROSTER_SECTIONS = (
    'api_contributors',
    'issue_pr_contributors',
    'file_contributors',
    'wiki_contributors',
)


def _clone_target(vcs_link):
    """Return ``(clone_url, local_path)`` for one upstream VCS link."""
    # Strip first so stray whitespace never ends up in the directory name.
    clone_url = vcs_link.strip()
    if "github" in clone_url or "gitlab" in clone_url:
        # making an evaluation that sub branches aren't being used and that
        # people would fork if needed; this only looks at main, so anything
        # past the fifth "/"-separated segment of the URL is dropped.
        clone_url = "/".join(clone_url.split("/")[:5])
    # Ignore trailing slashes so the repository name is never empty, and take
    # the last segment so short URLs cannot raise IndexError.
    repo_name = clone_url.rstrip("/").split("/")[-1]
    return clone_url, temp_dir + repo_name + ".git"


def how_many_docs(dataset_csv):
    """Count projects whose repository root has README / CONTRIBUTING files.

    Every link in the ``upstream_vcs_link`` column of *dataset_csv* is cloned
    into ``temp_dir``, the top-level directory entries are inspected, and the
    clone is deleted again.

    Args:
        dataset_csv: Path of a CSV file with an ``upstream_vcs_link`` column.

    Returns:
        A ``(readme_count, contributing_count)`` tuple: the number of
        repositories with a top-level entry whose upper-cased name contains
        ``README`` and ``CONTRIBUTING`` respectively.

    Any error raised while cloning or listing a repository propagates to the
    caller; the temporary checkout is removed first.
    """
    df = pd.read_csv(dataset_csv)
    project_repos = df['upstream_vcs_link'].to_list()
    print(len(project_repos))
    readme_count = 0
    contributing_count = 0
    for vcs_link in tqdm(project_repos):
        clone_url, full_temp_path = _clone_target(vcs_link)
        # git refuses to clone into a non-empty directory, so clear anything
        # left behind by an earlier, interrupted run.
        shutil.rmtree(full_temp_path, ignore_errors=True)
        try:
            Repo.clone_from(clone_url, full_temp_path)
            entry_names = [name.upper() for name in os.listdir(full_temp_path)]
        finally:
            # Always remove the checkout, even when cloning or listing failed.
            shutil.rmtree(full_temp_path, ignore_errors=True)
        if any("README" in name for name in entry_names):
            readme_count += 1
        if any("CONTRIBUTING" in name for name in entry_names):
            contributing_count += 1
    return readme_count, contributing_count


def calc_file_denom(project_name, roster_dir=ROSTER_DIR):
    """Return the number of distinct contributors in a project's roster file.

    Loads ``contrib_roster_<project_name>.json`` from *roster_dir*, prints the
    summed length of its four contributor lists, then merges them: the
    ``api_contributors`` list is taken as-is and entries from the remaining
    lists are appended only when an equal entry is not already present.

    Args:
        project_name: Project name used to build the roster file name.
        roster_dir: Directory containing the roster files.

    Returns:
        Length of the merged roster.
    """
    roster_path = os.path.join(roster_dir, 'contrib_roster_' + project_name + '.json')
    with open(roster_path, encoding='utf-8') as file:
        data = json.load(file)
    print(sum(len(data[section]) for section in ROSTER_SECTIONS))
    # Copy so the loaded data is not modified while merging.
    running_roster = list(data[ROSTER_SECTIONS[0]])
    for section in ROSTER_SECTIONS[1:]:
        for individual in data[section]:
            # List membership (not a set) because the entry type is not known
            # here and may be unhashable.
            if individual not in running_roster:
                running_roster.append(individual)
    return len(running_roster)


def for_all_projects(input_csv='final_data/deb_octo_data.csv',
                     output_csv='new_denom_032624.csv'):
    """Copy *input_csv* to *output_csv*, adding a ``contrib_denom`` column.

    Each input row gets ``contrib_denom`` set to
    ``calc_file_denom(row['project_name'])`` and is written out under the
    fixed column list below, preceded by a header row.

    Args:
        input_csv: Path of the CSV to read; needs a ``project_name`` column.
        output_csv: Path of the CSV to (over)write.
    """
    keys = [
        "project_name", "underproduction_mean", "underproduction_low",
        "underproduction_high", "debian_vcs_link", "upstream_vcs_link",
        "age_of_project", "contributors", "collaborators", "milestone_count",
        "api_contrib_count", "issue_contrib_count", "file_contrib_count",
        "wiki_contrib_count", "contrib_denom",
    ]
    with open(input_csv, newline='') as csvfile, \
            open(output_csv, 'w', newline='') as writefile:
        reader = csv.DictReader(csvfile)
        writer = csv.DictWriter(writefile, fieldnames=keys)
        # Emit the column names so the output file is self-describing.
        writer.writeheader()
        for row in reader:
            row['contrib_denom'] = calc_file_denom(row['project_name'])
            writer.writerow(row)


if __name__ == "__main__":
    # Other entry points that can be run from here instead:
    #   for_all_projects()
    #   print(calc_file_denom("zzz-to-char"))
    readmec, contributingc = how_many_docs("final_data/deb_full_data.csv")
    print("README COUNT: " + str(readmec) + "|| CONTRIBUTING COUNT: " + str(contributingc))