Miscellaneous emergency fixes for the scraper

This commit is contained in:
Matthew Gaughan 2023-12-12 11:05:22 -06:00
parent 154a8b9d92
commit 7aa3af05ea
3 changed files with 22 additions and 6 deletions

View File

@ -11,7 +11,7 @@ import debian_queries as dqs
key = os.environ.get('KKEXKEY') key = os.environ.get('KKEXKEY')
test_csv_path = "121123_expanded_data.csv" test_csv_path = "121223_expanded_data.csv"
def main(): def main():
early_cutoff = dt.datetime(2008,2, 8) early_cutoff = dt.datetime(2008,2, 8)
@ -43,16 +43,20 @@ def main():
if project_dict["upstream_vcs_link"] == "": if project_dict["upstream_vcs_link"] == "":
no_upstream += 1 no_upstream += 1
continue continue
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) try:
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
except:
print("perceval timeout")
continue
if perceval_data == {}: if perceval_data == {}:
continue continue
project_rosters = {} project_rosters = {}
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list'] project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list']
with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path: with open("/data/users/mgaughan/kkex_roster_data_121223/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
json.dump(project_rosters, roster_path) json.dump(project_rosters, roster_path)
if "github" in project_dict["upstream_vcs_link"]: if "github" in project_dict["upstream_vcs_link"]:
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: with open('/data/users/mgaughan/kkex_comment_data_121223/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
else: else:
project_dict['milestone_count'] = 0 project_dict['milestone_count'] = 0
@ -64,7 +68,7 @@ def main():
meta_dict['total_success'] = successful_count meta_dict['total_success'] = successful_count
meta_dict['no_upstream_info'] = no_upstream meta_dict['no_upstream_info'] = no_upstream
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
with open('121123_metadata_expanded.json', 'w') as data_path: with open('121223_metadata_expanded.json', 'w') as data_path:
json.dump(meta_dict, data_path) json.dump(meta_dict, data_path)

View File

@ -2,10 +2,14 @@ import requests
import datetime as dt import datetime as dt
import json import json
import os import os
from wrapt_timeout_decorator import *
import time
key = os.environ.get('KKEXKEY') key = os.environ.get('KKEXKEY')
def main(vcs, early_cutoff): def main(vcs, early_cutoff):
time.sleep(1)
gsql_dict = {} gsql_dict = {}
vcs_list = vcs.split('/') vcs_list = vcs.split('/')
repo_name = '"' + vcs_list[-1] + '"' repo_name = '"' + vcs_list[-1] + '"'
@ -28,6 +32,7 @@ def get_discussion_gql(repo_owner, repo_name):
number number
state state
author { author {
login
url url
} }
labels(first:5) { labels(first:5) {
@ -41,8 +46,10 @@ def get_discussion_gql(repo_owner, repo_name):
# edges.node is where the actual `Comment` object is # edges.node is where the actual `Comment` object is
edges { edges {
node { node {
id
author { author {
avatarUrl login
url
} }
body body
} }

View File

@ -1,6 +1,8 @@
import datetime as dt import datetime as dt
from perceval.backends.core.git import Git from perceval.backends.core.git import Git
import argparse import argparse
from wrapt_timeout_decorator import *
import json
#globals #globals
#repo_dir = '/tmp/' #repo_dir = '/tmp/'
@ -12,6 +14,8 @@ def main(vcs_path, begin_date):
if len(perceval_info['list_of_commits']) > 0: if len(perceval_info['list_of_commits']) > 0:
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits']) perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits']) perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits'])
with open("/data/users/mgaughan/kkex_commit_data_121223/" + "commits_" + vcs_path.split('/')[-1] + '.json', 'w') as commits_path:
json.dump(perceval_info, commits_path)
del perceval_info['list_of_commits'] del perceval_info['list_of_commits']
print(perceval_info) print(perceval_info)
return perceval_info return perceval_info
@ -22,6 +26,7 @@ def main(vcs_path, begin_date):
# this is the primary function for getting the list of commits from perceval # this is the primary function for getting the list of commits from perceval
@timeout(600, use_signals=False)
def get_perceval_log(vcs_path, begin_date): def get_perceval_log(vcs_path, begin_date):
vcs_path = vcs_path.strip() vcs_path = vcs_path.strip()
print(vcs_path) print(vcs_path)