"""Collect non-coding contribution counts for GitHub-hosted projects.

For every row of the input CSV whose upstream VCS link mentions GitHub, run
``octohatrack`` on the repository, record the contributor counts in the results
CSV and dump the full contributor roster as JSON.  Rows that cannot be
processed are written to a separate "failed rows" CSV so they can be retried.
"""

import ast
import csv
import json
import os
import subprocess
import sys
import time

#sys.path.append('../octohatrack')
#import octohatrack.__main__ as oh

csv_path = "013024_octo_data.csv"
csv_013024_path = "013024_te_rows.csv"

INPUT_CSV_PATH = "kk_final_expanded_data_final.csv"
OCTOHATRACK_DIR = "../octohatrack"
ROSTER_DIR = "/data/users/mgaughan/b_kkex_contrib_uni_013024/"

# Output columns copied verbatim from input columns 1..9, in that order.
_METADATA_KEYS = [
    "underproduction_mean",
    "underproduction_low",
    "underproduction_high",
    "debian_vcs_link",
    "upstream_vcs_link",
    "age_of_project",
    "contributors",
    "collaborators",
    "milestone_count",
]
_UPSTREAM_COLUMN = 5

# Output column -> key of the roster dict printed by octohatrack.
_ROSTER_FIELDS = {
    "api_contrib_count": "api_contributors",
    "issue_contrib_count": "issue_pr_contributors",
    "file_contrib_count": "file_contributors",
    "wiki_contrib_count": "wiki_contributors",
}

_FAILED_KEYS = ["project_name"] + _METADATA_KEYS
_RESULT_KEYS = _FAILED_KEYS + list(_ROSTER_FIELDS)

_ROSTER_MARKER = "{'api_contributors'"


def _repo_slug(url):
    """Return ``(owner, repo)`` parsed from a GitHub URL, or None if absent."""
    _, marker, tail = url.strip().partition("github.com")
    if not marker:
        return None
    segments = [part for part in tail.lstrip("/:").split("/") if part]
    if len(segments) < 2:
        return None
    owner, repo = segments[0], segments[1]
    if repo.endswith(".git"):
        repo = repo[:-4]
    if not repo:
        return None
    return owner, repo


def _parse_roster(stdout):
    """Return the first well-formed roster dict found in *stdout*, or None."""
    for entry in stdout.split("\n"):
        start = entry.find(_ROSTER_MARKER)
        if start == -1:
            continue
        try:
            # literal_eval only accepts Python literals, so it is safe to use
            # on the subprocess output; skip any text printed before the dict.
            roster = ast.literal_eval(entry[start:].strip())
        except (ValueError, SyntaxError):
            continue
        if isinstance(roster, dict) and all(
            key in roster for key in _ROSTER_FIELDS.values()
        ):
            return roster
    return None


def main(
    *,
    input_path=INPUT_CSV_PATH,
    output_path=csv_path,
    failed_path=csv_013024_path,
    octohatrack_dir=OCTOHATRACK_DIR,
    roster_dir=ROSTER_DIR,
    sleep_seconds=5,
):
    """Scrape octohatrack contributor counts for each GitHub project.

    Args:
        input_path: CSV to read; column 5 holds the upstream VCS link and
            columns 1-9 are copied into the output.
        output_path: CSV receiving one row per successfully scraped project.
        failed_path: CSV receiving the rows that could not be scraped
            (unparseable link, timeout, or no roster in octohatrack's output).
        octohatrack_dir: Directory from which ``python3 -m octohatrack`` is run.
        roster_dir: Directory receiving ``contrib_roster_<project>.json`` files.
        sleep_seconds: Pause before each octohatrack call.

    Raises:
        OSError: If a file cannot be opened or octohatrack cannot be launched.
    """
    with open(output_path, 'w', newline='') as results_file, \
            open(failed_path, 'w', newline='') as failed_file, \
            open(input_path, 'r', newline='') as input_file:
        results_writer = csv.DictWriter(results_file, _RESULT_KEYS)
        results_writer.writeheader()
        failed_writer = csv.DictWriter(failed_file, _FAILED_KEYS)
        failed_writer.writeheader()

        for line in csv.reader(input_file):
            if len(line) <= len(_METADATA_KEYS):
                # Blank or truncated row: there is nothing usable to record.
                print("skipping malformed row:", line)
                continue
            upstream_link = line[_UPSTREAM_COLUMN]
            if "github" not in upstream_link:
                continue

            project_dict = dict(zip(_METADATA_KEYS, line[1:]))
            slug = _repo_slug(upstream_link)
            if slug is None:
                project_dict["project_name"] = ""
                failed_writer.writerow(project_dict)
                print("could not parse a GitHub repository from", upstream_link)
                continue
            owner, project_name = slug
            repo_name = owner + "/" + project_name
            print(repo_name)
            project_dict["project_name"] = project_name

            # Pause only ahead of a real octohatrack call; skipped rows
            # should not cost any waiting time.
            if sleep_seconds:
                time.sleep(sleep_seconds)
            ##--- FAILS FOR:
            ## - archived GitHub things
            ## - not github hosted
            try:
                # cwd= keeps this process's own working directory untouched.
                octohatrack_results = subprocess.run(
                    ['python3', '-m', 'octohatrack', repo_name],
                    capture_output=True,
                    text=True,
                    timeout=60,
                    cwd=octohatrack_dir,
                ).stdout
            except subprocess.TimeoutExpired as e:
                failed_writer.writerow(project_dict)
                print(e)
                continue

            formatted = _parse_roster(octohatrack_results)
            if formatted is None:
                # octohatrack exited without printing a roster; keep the row
                # for a retry instead of dropping it silently.
                failed_writer.writerow(project_dict)
                print("no contributor roster returned for", repo_name)
                continue

            for column, roster_key in _ROSTER_FIELDS.items():
                project_dict[column] = len(formatted[roster_key])
            print(project_dict)
            results_writer.writerow(project_dict)

            os.makedirs(roster_dir, exist_ok=True)
            roster_path = os.path.join(
                roster_dir, 'contrib_roster_' + project_name + '.json'
            )
            with open(roster_path, 'w') as data_path:
                json.dump(formatted, data_path)


if __name__ == "__main__":
    main()