From 7aa3af05eacf34797ad6b72441db1d3da45c5acf Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 12 Dec 2023 11:05:22 -0600 Subject: [PATCH] misc fire remediations for scraper --- expanded_data_collection.py | 14 +++++++++----- gh_gsql_req.py | 9 ++++++++- perceval_tasks.py | 5 +++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/expanded_data_collection.py b/expanded_data_collection.py index 686d9f5..ca30cc8 100644 --- a/expanded_data_collection.py +++ b/expanded_data_collection.py @@ -11,7 +11,7 @@ import debian_queries as dqs key = os.environ.get('KKEXKEY') -test_csv_path = "121123_expanded_data.csv" +test_csv_path = "121223_expanded_data.csv" def main(): early_cutoff = dt.datetime(2008,2, 8) @@ -43,16 +43,20 @@ def main(): if project_dict["upstream_vcs_link"] == "": no_upstream += 1 continue - perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) + try: + perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) + except: + print("perceval timeout") + continue if perceval_data == {}: continue project_rosters = {} project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list'] - with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path: + with open("/data/users/mgaughan/kkex_roster_data_121223/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path: json.dump(project_rosters, roster_path) if "github" in project_dict["upstream_vcs_link"]: project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) - with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: + with open('/data/users/mgaughan/kkex_comment_data_121223/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) else: project_dict['milestone_count'] = 0 @@ -64,7 +68,7 @@ def main(): meta_dict['total_success'] = successful_count meta_dict['no_upstream_info'] = no_upstream #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) - with open('121123_metadata_expanded.json', 'w') as data_path: + with open('121223_metadata_expanded.json', 'w') as data_path: json.dump(meta_dict, data_path) diff --git a/gh_gsql_req.py b/gh_gsql_req.py index faa52ed..457d773 100644 --- a/gh_gsql_req.py +++ b/gh_gsql_req.py @@ -2,10 +2,14 @@ import requests import datetime as dt import json import os +from wrapt_timeout_decorator import * +import time + key = os.environ.get('KKEXKEY') def main(vcs, early_cutoff): + time.sleep(1) gsql_dict = {} vcs_list = vcs.split('/') repo_name = '"' + vcs_list[-1] + '"' @@ -28,6 +32,7 @@ def get_discussion_gql(repo_owner, repo_name): number state author { + login url } labels(first:5) { @@ -41,8 +46,10 @@ def get_discussion_gql(repo_owner, repo_name): # edges.node is where the actual `Comment` object is edges { node { + id author { - avatarUrl + login + url } body } diff --git a/perceval_tasks.py b/perceval_tasks.py index fb4a49e..24fae40 100644 --- a/perceval_tasks.py +++ b/perceval_tasks.py @@ -1,6 +1,8 @@ import datetime as dt from perceval.backends.core.git import Git import argparse +from wrapt_timeout_decorator import * +import json #globals #repo_dir = '/tmp/' @@ -12,6 +14,8 @@ def main(vcs_path, begin_date): if len(perceval_info['list_of_commits']) > 0: perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits']) perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits']) + with open("/data/users/mgaughan/kkex_commit_data_121223/" + "commits_" + vcs_path.split('/')[-1] + '.json', 'w') as commits_path: + json.dump(perceval_info, commits_path) del perceval_info['list_of_commits'] print(perceval_info) return perceval_info @@ -22,6 +26,7 @@ def main(vcs_path, begin_date): # this is the primary function for getting the list of commits from perceval +@timeout(600, use_signals=False) def get_perceval_log(vcs_path, begin_date): vcs_path = vcs_path.strip() print(vcs_path)