Miscellaneous emergency ("fire") fixes for the scraper

This commit is contained in:
Matthew Gaughan 2023-12-12 11:05:22 -06:00
parent 154a8b9d92
commit 7aa3af05ea
3 changed files with 22 additions and 6 deletions

View File

@ -11,7 +11,7 @@ import debian_queries as dqs
key = os.environ.get('KKEXKEY')
test_csv_path = "121123_expanded_data.csv"
test_csv_path = "121223_expanded_data.csv"
def main():
early_cutoff = dt.datetime(2008,2, 8)
@ -43,16 +43,20 @@ def main():
if project_dict["upstream_vcs_link"] == "":
no_upstream += 1
continue
try:
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
except:
print("perceval timeout")
continue
if perceval_data == {}:
continue
project_rosters = {}
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list']
with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
with open("/data/users/mgaughan/kkex_roster_data_121223/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
json.dump(project_rosters, roster_path)
if "github" in project_dict["upstream_vcs_link"]:
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
with open('/data/users/mgaughan/kkex_comment_data_121223/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
else:
project_dict['milestone_count'] = 0
@ -64,7 +68,7 @@ def main():
meta_dict['total_success'] = successful_count
meta_dict['no_upstream_info'] = no_upstream
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
with open('121123_metadata_expanded.json', 'w') as data_path:
with open('121223_metadata_expanded.json', 'w') as data_path:
json.dump(meta_dict, data_path)

View File

@ -2,10 +2,14 @@ import requests
import datetime as dt
import json
import os
from wrapt_timeout_decorator import *
import time
key = os.environ.get('KKEXKEY')
def main(vcs, early_cutoff):
time.sleep(1)
gsql_dict = {}
vcs_list = vcs.split('/')
repo_name = '"' + vcs_list[-1] + '"'
@ -28,6 +32,7 @@ def get_discussion_gql(repo_owner, repo_name):
number
state
author {
login
url
}
labels(first:5) {
@ -41,8 +46,10 @@ def get_discussion_gql(repo_owner, repo_name):
# edges.node is where the actual `Comment` object is
edges {
node {
id
author {
avatarUrl
login
url
}
body
}

View File

@ -1,6 +1,8 @@
import datetime as dt
from perceval.backends.core.git import Git
import argparse
from wrapt_timeout_decorator import *
import json
#globals
#repo_dir = '/tmp/'
@ -12,6 +14,8 @@ def main(vcs_path, begin_date):
if len(perceval_info['list_of_commits']) > 0:
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits'])
with open("/data/users/mgaughan/kkex_commit_data_121223/" + "commits_" + vcs_path.split('/')[-1] + '.json', 'w') as commits_path:
json.dump(perceval_info, commits_path)
del perceval_info['list_of_commits']
print(perceval_info)
return perceval_info
@ -22,6 +26,7 @@ def main(vcs_path, begin_date):
# this is the primary function for getting the list of commits from perceval
@timeout(600, use_signals=False)
def get_perceval_log(vcs_path, begin_date):
vcs_path = vcs_path.strip()
print(vcs_path)