77 lines
3.3 KiB
Python
77 lines
3.3 KiB
Python
import json
|
|
import os
|
|
import csv
|
|
import pandas as pd
|
|
from git import Repo
|
|
from tqdm import tqdm
|
|
import shutil
|
|
|
|
# Scratch directory that how_many_docs clones each repository into. Clone
# paths are built by plain string concatenation (temp_dir + name + ".git"),
# so the trailing slash is required.
temp_dir = "/data/users/mgaughan/tmp3/"
|
|
|
|
def how_many_docs(dataset_csv):
    """Count projects whose repository root has README / CONTRIBUTING files.

    Reads the ``upstream_vcs_link`` column of ``dataset_csv``, clones each
    link into ``temp_dir``, looks at the names in the top level of the
    clone, and removes the clone again before moving on to the next link.
    The number of links is printed before cloning starts.

    Args:
        dataset_csv: Path to a CSV file with an ``upstream_vcs_link`` column.

    Returns:
        A ``(readme_count, contributing_count)`` tuple: how many repositories
        have at least one top-level entry whose name contains "README" and
        "CONTRIBUTING" respectively (case-insensitive substring match).

    Raises:
        IndexError: If a github/gitlab link has fewer than five
            ``/``-separated parts.
        Exception: Anything raised by ``Repo.clone_from`` or ``os.listdir``
            propagates to the caller; the clone directory is still removed.
    """
    df = pd.read_csv(dataset_csv)
    project_repos = df['upstream_vcs_link'].to_list()
    print(len(project_repos))
    readme_count = 0
    contributing_count = 0
    for vcs_link in tqdm(project_repos):
        # Strip before deriving anything from the link so stray whitespace
        # cannot end up in the clone URL or in the clone directory name.
        vcs_link = vcs_link.strip()
        if "github" in vcs_link or "gitlab" in vcs_link:
            # Keep only the first five '/'-separated parts of the link
            # (through the repository name), dropping any deeper path such
            # as a branch or sub-directory; only the default checkout is
            # inspected.
            vcs_link = "/".join(vcs_link.split("/")[0:5])
            full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
        else:
            full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
        try:
            Repo.clone_from(vcs_link, full_temp_path)
            top_level_names = [name.upper() for name in os.listdir(full_temp_path)]
        finally:
            # Always remove the clone, including a partial one left behind by
            # a failed clone, so temp_dir does not fill up.
            shutil.rmtree(full_temp_path, ignore_errors=True)
        if any("README" in name for name in top_level_names):
            readme_count += 1
        if any("CONTRIBUTING" in name for name in top_level_names):
            contributing_count += 1
    return readme_count, contributing_count
|
|
|
|
|
|
|
|
def calc_file_denom(project_name):
    """Return the size of the merged contributor roster for one project.

    Loads ``contrib_roster_<project_name>.json`` from the roster directory
    and prints the summed length of its four contributor lists. It then
    appends to the ``api_contributors`` list every entry of the issue/PR,
    file and wiki lists that the list does not already contain. Entries
    already in ``api_contributors`` are left untouched.

    Args:
        project_name: Project name used to build the roster file name.

    Returns:
        The length of the merged roster.
    """
    roster_dir = '/data/users/mgaughan/kkex/contrib_uni_rosters_013124/'
    roster_path = roster_dir + 'contrib_roster_' + project_name + '.json'
    with open(roster_path) as roster_file:
        rosters = json.load(roster_file)

    # Raw total before merging, printed for comparison with the result.
    raw_total = (
        len(rosters['api_contributors'])
        + len(rosters['issue_pr_contributors'])
        + len(rosters['file_contributors'])
        + len(rosters['wiki_contributors'])
    )
    print(raw_total)

    # The API list is the base; the other lists are folded into it in order.
    merged = rosters['api_contributors']
    other_sources = ('issue_pr_contributors', 'file_contributors', 'wiki_contributors')
    for source in other_sources:
        for person in rosters[source]:
            if person in merged:
                continue
            merged.append(person)
    return len(merged)
|
|
|
|
def for_all_projects():
    """Copy the project CSV, filling in ``contrib_denom`` for every row.

    Reads ``final_data/deb_octo_data.csv`` row by row, sets each row's
    ``contrib_denom`` to ``calc_file_denom(row['project_name'])``, and writes
    the row to ``new_denom_032624.csv``. No header row is written to the
    output file.
    """
    columns = ["project_name","underproduction_mean","underproduction_low","underproduction_high","debian_vcs_link","upstream_vcs_link","age_of_project","contributors","collaborators","milestone_count", "api_contrib_count", "issue_contrib_count", "file_contrib_count", "wiki_contrib_count", "contrib_denom"]
    with open('final_data/deb_octo_data.csv', newline='') as source, \
            open('new_denom_032624.csv', 'w', newline='') as sink:
        records = csv.DictReader(source)
        out = csv.DictWriter(sink, fieldnames=columns)
        for record in records:
            project = record['project_name']
            record['contrib_denom'] = calc_file_denom(project)
            out.writerow(record)
|
|
|
|
|
|
if __name__ == "__main__":
    # Other entry points, currently disabled:
    #   for_all_projects()
    #   print(calc_file_denom("zzz-to-char"))
    counts = how_many_docs("final_data/deb_full_data.csv")
    readmec, contributingc = counts
    print(f"README COUNT: {readmec}|| CONTRIBUTING COUNT: {contributingc}")