24_deb_pkg_gov/non_coding_measures_scrape.py

55 lines
2.7 KiB
Python
Raw Normal View History

2023-12-19 02:43:39 +00:00
import pexpect
import os
import json
import sys
import subprocess
import csv
2024-01-15 17:55:54 +00:00
import ast
2023-12-19 02:43:39 +00:00
#sys.path.append('../octohatrack')
#import octohatrack.__main__ as oh
2024-01-22 17:20:52 +00:00
# Path of the output CSV that main() opens for writing: a header row followed
# by one row of contributor counts per processed project.
csv_path = "011824_uni_contrib.csv"
2023-12-19 02:43:39 +00:00
def main():
    """Record octohatrack contributor counts for each listed GitHub project.

    Reads ``expanded_data_final.csv`` from the current directory and treats
    the sixth column of every row as the project's repository URL.  For each
    GitHub repository it runs ``python3 -m octohatrack <owner>/<project>
    --wait-for-reset`` inside ``../octohatrack`` (60 second timeout), parses
    the contributor dict that the tool prints, writes one row of counts to
    ``csv_path`` and dumps the full dict to a per-project JSON roster file.

    Rows that are too short, are not GitHub URLs, fail to run, or produce no
    parseable result are reported on stdout and skipped; they get neither a
    CSV row nor a roster file.
    """
    keys = ["project_name", "project_owner", "api_contrib_count", "issue_contrib_count", "file_contrib_count", "wiki_contrib_count"]
    roster_dir = '/data/users/mgaughan/b_kkex_contrib_uni_011824/'
    with open(csv_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        # newline='' lets the csv module do its own line-ending handling.
        with open("expanded_data_final.csv", "r", newline='') as file:
            for line in csv.reader(file):
                # Blank or truncated rows have no URL column to inspect.
                if len(line) < 6 or "github" not in line[5]:
                    continue
                slug = _repo_slug(line[5])
                if slug is None:
                    continue
                owner, project = slug
                repo_name = owner + "/" + project
                print(repo_name)
                try:
                    # cwd= runs the tool from its checkout without changing
                    # this process's working directory, so nothing has to be
                    # restored on the failure path.
                    completed = subprocess.run(
                        ['python3', '-m', 'octohatrack', repo_name, '--wait-for-reset'],
                        capture_output=True,
                        text=True,
                        timeout=60,
                        cwd="../octohatrack",
                    )
                except (subprocess.SubprocessError, OSError, ValueError) as err:
                    # Timeouts, launch failures and undecodable output are
                    # per-repository problems: report and move on.
                    print(f"issue with the repository {repo_name}: {err}")
                    continue
                roster = _parse_roster(completed.stdout)
                if roster is None:
                    # Without this guard a previous project's roster would be
                    # written under this project's name.
                    print(f"no contributor data returned for {repo_name}, skipping")
                    continue
                # NOTE(review): the wiki count used to be taken from
                # 'file_contributors'.  Prefer a 'wiki_contributors' entry
                # when octohatrack provides one; the fallback keeps the old
                # value until the tool's output keys are confirmed.
                wiki_contributors = roster.get('wiki_contributors', roster['file_contributors'])
                project_dict = {
                    "project_name": project,
                    "project_owner": owner,
                    "api_contrib_count": len(roster['api_contributors']),
                    "issue_contrib_count": len(roster['issue_pr_contributors']),
                    "file_contrib_count": len(roster['file_contributors']),
                    "wiki_contrib_count": len(wiki_contributors),
                }
                print(project_dict)
                dict_writer.writerow(project_dict)
                with open(roster_dir + 'contrib_roster_' + project + '.json', 'w') as data_path:
                    json.dump(roster, data_path)


def _repo_slug(url):
    """Return ``(owner, project)`` from a GitHub URL, or ``None`` if absent."""
    _, marker, path = url.strip().partition("github.com/")
    if not marker:
        return None
    # Dropping empty segments tolerates doubled or trailing slashes.
    segments = [segment for segment in path.split("/") if segment]
    if len(segments) < 2:
        return None
    return segments[0], segments[1]


def _parse_roster(output):
    """Return the last contributor dict printed in *output*, or ``None``."""
    required = ('api_contributors', 'issue_pr_contributors', 'file_contributors')
    roster = None
    for entry in output.split("\n"):
        if "{'api_contributors'" not in entry:
            continue
        try:
            candidate = ast.literal_eval(entry.strip())
        except (ValueError, SyntaxError):
            # A truncated or otherwise malformed line is not a usable result.
            continue
        if isinstance(candidate, dict) and all(key in candidate for key in required):
            roster = candidate
    return roster
2023-12-19 02:43:39 +00:00
# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()