misc fire remediations for scraper
This commit is contained in:
parent
154a8b9d92
commit
7aa3af05ea
@ -11,7 +11,7 @@ import debian_queries as dqs
|
|||||||
|
|
||||||
|
|
||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
test_csv_path = "121123_expanded_data.csv"
|
test_csv_path = "121223_expanded_data.csv"
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
early_cutoff = dt.datetime(2008,2, 8)
|
early_cutoff = dt.datetime(2008,2, 8)
|
||||||
@ -43,16 +43,20 @@ def main():
|
|||||||
if project_dict["upstream_vcs_link"] == "":
|
if project_dict["upstream_vcs_link"] == "":
|
||||||
no_upstream += 1
|
no_upstream += 1
|
||||||
continue
|
continue
|
||||||
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
|
try:
|
||||||
|
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
|
||||||
|
except:
|
||||||
|
print("perceval timeout")
|
||||||
|
continue
|
||||||
if perceval_data == {}:
|
if perceval_data == {}:
|
||||||
continue
|
continue
|
||||||
project_rosters = {}
|
project_rosters = {}
|
||||||
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list']
|
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list']
|
||||||
with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
|
with open("/data/users/mgaughan/kkex_roster_data_121223/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
|
||||||
json.dump(project_rosters, roster_path)
|
json.dump(project_rosters, roster_path)
|
||||||
if "github" in project_dict["upstream_vcs_link"]:
|
if "github" in project_dict["upstream_vcs_link"]:
|
||||||
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
|
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
|
||||||
with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
|
with open('/data/users/mgaughan/kkex_comment_data_121223/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
|
||||||
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
|
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
|
||||||
else:
|
else:
|
||||||
project_dict['milestone_count'] = 0
|
project_dict['milestone_count'] = 0
|
||||||
@ -64,7 +68,7 @@ def main():
|
|||||||
meta_dict['total_success'] = successful_count
|
meta_dict['total_success'] = successful_count
|
||||||
meta_dict['no_upstream_info'] = no_upstream
|
meta_dict['no_upstream_info'] = no_upstream
|
||||||
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
|
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
|
||||||
with open('121123_metadata_expanded.json', 'w') as data_path:
|
with open('121223_metadata_expanded.json', 'w') as data_path:
|
||||||
json.dump(meta_dict, data_path)
|
json.dump(meta_dict, data_path)
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,10 +2,14 @@ import requests
|
|||||||
import datetime as dt
|
import datetime as dt
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from wrapt_timeout_decorator import *
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
|
|
||||||
def main(vcs, early_cutoff):
|
def main(vcs, early_cutoff):
|
||||||
|
time.sleep(1)
|
||||||
gsql_dict = {}
|
gsql_dict = {}
|
||||||
vcs_list = vcs.split('/')
|
vcs_list = vcs.split('/')
|
||||||
repo_name = '"' + vcs_list[-1] + '"'
|
repo_name = '"' + vcs_list[-1] + '"'
|
||||||
@ -28,6 +32,7 @@ def get_discussion_gql(repo_owner, repo_name):
|
|||||||
number
|
number
|
||||||
state
|
state
|
||||||
author {
|
author {
|
||||||
|
login
|
||||||
url
|
url
|
||||||
}
|
}
|
||||||
labels(first:5) {
|
labels(first:5) {
|
||||||
@ -41,8 +46,10 @@ def get_discussion_gql(repo_owner, repo_name):
|
|||||||
# edges.node is where the actual `Comment` object is
|
# edges.node is where the actual `Comment` object is
|
||||||
edges {
|
edges {
|
||||||
node {
|
node {
|
||||||
|
id
|
||||||
author {
|
author {
|
||||||
avatarUrl
|
login
|
||||||
|
url
|
||||||
}
|
}
|
||||||
body
|
body
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
import datetime as dt
|
import datetime as dt
|
||||||
from perceval.backends.core.git import Git
|
from perceval.backends.core.git import Git
|
||||||
import argparse
|
import argparse
|
||||||
|
from wrapt_timeout_decorator import *
|
||||||
|
import json
|
||||||
|
|
||||||
#globals
|
#globals
|
||||||
#repo_dir = '/tmp/'
|
#repo_dir = '/tmp/'
|
||||||
@ -12,6 +14,8 @@ def main(vcs_path, begin_date):
|
|||||||
if len(perceval_info['list_of_commits']) > 0:
|
if len(perceval_info['list_of_commits']) > 0:
|
||||||
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
|
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
|
||||||
perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits'])
|
perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits'])
|
||||||
|
with open("/data/users/mgaughan/kkex_commit_data_121223/" + "commits_" + vcs_path.split('/')[-1] + '.json', 'w') as commits_path:
|
||||||
|
json.dump(perceval_info, commits_path)
|
||||||
del perceval_info['list_of_commits']
|
del perceval_info['list_of_commits']
|
||||||
print(perceval_info)
|
print(perceval_info)
|
||||||
return perceval_info
|
return perceval_info
|
||||||
@ -22,6 +26,7 @@ def main(vcs_path, begin_date):
|
|||||||
|
|
||||||
|
|
||||||
# this is the primary function for getting the list of commits from perceval
|
# this is the primary function for getting the list of commits from perceval
|
||||||
|
@timeout(600, use_signals=False)
|
||||||
def get_perceval_log(vcs_path, begin_date):
|
def get_perceval_log(vcs_path, begin_date):
|
||||||
vcs_path = vcs_path.strip()
|
vcs_path = vcs_path.strip()
|
||||||
print(vcs_path)
|
print(vcs_path)
|
||||||
|
Loading…
Reference in New Issue
Block a user