# 24_deb_pkg_gov/non_coding_measures_scrape.py

import pexpect
import os
import json
import sys
import subprocess
import csv
import ast
import time
#sys.path.append('../octohatrack')
#import octohatrack.__main__ as oh
csv_path = "c_021824_octo_data.csv"
csv_013024_path = "c_021824_te_rows.csv"
def main():
    """Collect octohatrack contributor counts for GitHub-hosted Debian packages.

    Reads rows from ``c_020524_te_rows.csv`` (column 5 is the upstream VCS
    link), runs ``octohatrack`` on each GitHub-hosted repo, and appends the
    parsed contributor counts to ``csv_path``. Rows whose octohatrack run
    times out or fails are written (without counts) to ``csv_013024_path``.
    Each successfully parsed roster dict is also dumped to a per-project
    JSON file.

    Side effects: changes the working directory back and forth between the
    script's cwd and ``../octohatrack`` (octohatrack is run from its own
    checkout), writes two CSVs and one JSON per successful project.
    """
    wd = os.getcwd()
    octo_keys = ["project_name","underproduction_mean","underproduction_low","underproduction_high","debian_vcs_link","upstream_vcs_link","age_of_project","contributors","collaborators","milestone_count", "api_contrib_count", "issue_contrib_count", "file_contrib_count", "wiki_contrib_count", ]
    fallback_keys = ["project_name","underproduction_mean","underproduction_low","underproduction_high","debian_vcs_link","upstream_vcs_link","age_of_project","contributors","collaborators","milestone_count"]
    # Both output files must stay open for the whole processing loop —
    # the writers are used inside it — so the context managers are nested
    # (and given distinct names to avoid shadowing one handle with the other).
    with open(csv_path, 'w', newline='') as octo_file, \
         open(csv_013024_path, 'w', newline='') as fallback_file:
        dict_writer = csv.DictWriter(octo_file, octo_keys)
        dict_writer.writeheader()
        dict_writer2 = csv.DictWriter(fallback_file, fallback_keys)
        dict_writer2.writeheader()
        with open("c_020524_te_rows.csv", "r") as file:
            reader = csv.reader(file)
            for i, line in enumerate(reader):
                project_dict = {}
                # Only GitHub-hosted upstreams can be fed to octohatrack.
                if "github" not in line[5]:
                    continue
                # Strip the "https://github.com/" prefix (19 characters),
                # leaving "owner/repo"; drop a trailing ".git" if present.
                repo_name = line[5].strip()[19:]
                if repo_name.endswith(".git"):
                    repo_name = repo_name[:-4]
                print(repo_name)
                project_dict["project_name"] = repo_name.split("/")[1]
                project_dict['underproduction_mean'] = line[1]
                project_dict['underproduction_low'] = line[2]
                project_dict['underproduction_high'] = line[3]
                project_dict['debian_vcs_link'] = line[4]
                project_dict['upstream_vcs_link'] = line[5]
                project_dict['age_of_project'] = line[6]
                project_dict['contributors'] = line[7]
                project_dict['collaborators'] = line[8]
                project_dict['milestone_count'] = line[9]
                # octohatrack is run from its own checkout, one dir up.
                os.chdir(wd)
                os.chdir("../octohatrack")
                ##--- FAILS FOR:
                ## - not github hosted
                try:
                    octohatrack_results = subprocess.run(['python3', '-m', 'octohatrack', repo_name, '--wait-for-reset'], capture_output = True, text=True, timeout=900).stdout
                except (subprocess.TimeoutExpired, TypeError) as e:
                    # Record the row without contributor counts and move on.
                    dict_writer2.writerow(project_dict)
                    print(e)
                    continue
                os.chdir(wd)
                # octohatrack prints its roster dict repr on its own line;
                # find that line and parse it with literal_eval.
                split_results = octohatrack_results.split("\n")
                for entry in split_results:
                    if "{'api_contributors'" in entry:
                        formatted = ast.literal_eval(entry)
                        project_dict["api_contrib_count"] = len(formatted['api_contributors'])
                        project_dict["issue_contrib_count"] = len(formatted['issue_pr_contributors'])
                        project_dict["file_contrib_count"] = len(formatted['file_contributors'])
                        project_dict["wiki_contrib_count"] = len(formatted['wiki_contributors'])
                        print(project_dict)
                        dict_writer.writerow(project_dict)
                        # Keep the raw roster for later consolidation.
                        with open('/data/users/mgaughan/d_kkex_contrib_uni_013124/' + 'contrib_roster_' + project_dict["project_name"] + '.json', 'w') as data_path:
                            json.dump(formatted, data_path)
def consolidate_rosters():
    """Merge saved contributor-roster JSON files back into the octo CSV rows.

    For each row of ``013024_octo_data.csv``, derives the project name from
    the upstream VCS link, locates that project's ``contrib_roster_*.json``
    in ``rosters_dir``, and fills the four ``*_contrib_count`` fields on the
    row (printed for inspection; the updated rows are not written back).
    """
    rosters_dir = "/data/users/mgaughan/c_kkex_contrib_uni_013124/"
    # NOTE: distinct handle names — the roster file must not shadow the CSV.
    with open("013024_octo_data.csv", 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            # Strip "https://github.com/" (19 chars) and take the repo part.
            project_name = row['upstream_vcs_link'].strip()[19:].split("/")[1]
            if project_name.endswith(".git"):
                project_name = project_name[:-4]
            # The roster filename is fully determined by the project name,
            # so check for it directly instead of re-scanning the whole
            # directory for every CSV row.
            roster_filename = 'contrib_roster_' + project_name + '.json'
            roster_path = rosters_dir + roster_filename
            if not os.path.exists(roster_path):
                continue
            with open(roster_path, "r") as roster_file:
                d = json.load(roster_file)
            api_contrib_count = len(d["api_contributors"])
            issue_contrib_count = len(d["issue_pr_contributors"])
            file_contrib_count = len(d["file_contributors"])
            wiki_contrib_count = len(d["wiki_contributors"])
            print(str(api_contrib_count) + " | " + str(issue_contrib_count) + " | " + str(file_contrib_count) + " | " + str(wiki_contrib_count) + " | ")
            row["api_contrib_count"] = api_contrib_count
            row["issue_contrib_count"] = issue_contrib_count
            row["file_contrib_count"] = file_contrib_count
            row["wiki_contrib_count"] = wiki_contrib_count
            print("match!")
            print(roster_filename)
            print(row)
def for_single_project(repo_name, project_name):
    """Run octohatrack for one repo and dump its roster dict to JSON.

    repo_name is the "owner/repo" slug passed to octohatrack;
    project_name names the output file. On timeout (or a TypeError from
    the run) the error is printed and nothing is written.
    """
    cmd = ['python3', '-m', 'octohatrack', repo_name, '--wait-for-reset']
    try:
        octohatrack_results = subprocess.run(cmd, capture_output=True, text=True, timeout=900).stdout
    except (subprocess.TimeoutExpired, TypeError) as e:
        print(e)
        return
    split_results = octohatrack_results.split("\n")
    print(split_results)
    # octohatrack prints its roster dict repr on one line; parse that line.
    roster = None
    for candidate in split_results:
        if "{'api_contributors'" in candidate:
            roster = ast.literal_eval(candidate)
            out_path = '/data/users/mgaughan/kkex/contrib_uni_rosters_013124/' + 'contrib_roster_' + project_name + '.json'
            with open(out_path, 'w') as data_path:
                json.dump(roster, data_path)
# Entry point: collect the contributor roster for one hard-coded project.
if __name__ == "__main__":
    for_single_project("agateau/yokadi", "yokadi")