Miscellaneous urgent fixes for the scraper
This commit is contained in:
		
							parent
							
								
									154a8b9d92
								
							
						
					
					
						commit
						7aa3af05ea
					
				| @ -11,7 +11,7 @@ import debian_queries as dqs | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| key = os.environ.get('KKEXKEY') | key = os.environ.get('KKEXKEY') | ||||||
| test_csv_path = "121123_expanded_data.csv" | test_csv_path = "121223_expanded_data.csv" | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|     early_cutoff = dt.datetime(2008,2, 8) |     early_cutoff = dt.datetime(2008,2, 8) | ||||||
| @ -43,16 +43,20 @@ def main(): | |||||||
|                 if project_dict["upstream_vcs_link"] == "": |                 if project_dict["upstream_vcs_link"] == "": | ||||||
|                     no_upstream += 1 |                     no_upstream += 1 | ||||||
|                     continue |                     continue | ||||||
|                 perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) |                 try: | ||||||
|  |                     perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) | ||||||
|  |                 except: | ||||||
|  |                     print("perceval timeout") | ||||||
|  |                     continue | ||||||
|                 if perceval_data == {}: |                 if perceval_data == {}: | ||||||
|                     continue |                     continue | ||||||
|                 project_rosters = {} |                 project_rosters = {} | ||||||
|                 project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list'] |                 project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"], project_rosters['contributors_list'], project_rosters['collaborators_list'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'], perceval_data['contributors_list'], perceval_data['collaborators_list'] | ||||||
|                 with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path: |                 with open("/data/users/mgaughan/kkex_roster_data_121223/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path: | ||||||
|                     json.dump(project_rosters, roster_path) |                     json.dump(project_rosters, roster_path) | ||||||
|                 if "github" in project_dict["upstream_vcs_link"]: |                 if "github" in project_dict["upstream_vcs_link"]: | ||||||
|                     project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) |                     project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) | ||||||
|                     with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: |                     with open('/data/users/mgaughan/kkex_comment_data_121223/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: | ||||||
|                         json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) |                         json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) | ||||||
|                 else: |                 else: | ||||||
|                     project_dict['milestone_count'] = 0 |                     project_dict['milestone_count'] = 0 | ||||||
| @ -64,7 +68,7 @@ def main(): | |||||||
|     meta_dict['total_success'] = successful_count |     meta_dict['total_success'] = successful_count | ||||||
|     meta_dict['no_upstream_info'] = no_upstream |     meta_dict['no_upstream_info'] = no_upstream | ||||||
|     #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) |     #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) | ||||||
|     with open('121123_metadata_expanded.json', 'w') as data_path: |     with open('121223_metadata_expanded.json', 'w') as data_path: | ||||||
|         json.dump(meta_dict, data_path) |         json.dump(meta_dict, data_path) | ||||||
|              |              | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -2,10 +2,14 @@ import requests | |||||||
| import datetime as dt | import datetime as dt | ||||||
| import json | import json | ||||||
| import os | import os | ||||||
|  | from wrapt_timeout_decorator import * | ||||||
|  | import time  | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| key = os.environ.get('KKEXKEY') | key = os.environ.get('KKEXKEY') | ||||||
| 
 | 
 | ||||||
| def main(vcs, early_cutoff): | def main(vcs, early_cutoff): | ||||||
|  |     time.sleep(1) | ||||||
|     gsql_dict = {} |     gsql_dict = {} | ||||||
|     vcs_list = vcs.split('/') |     vcs_list = vcs.split('/') | ||||||
|     repo_name = '"' + vcs_list[-1] + '"' |     repo_name = '"' + vcs_list[-1] + '"' | ||||||
| @ -28,6 +32,7 @@ def get_discussion_gql(repo_owner, repo_name): | |||||||
|                         number |                         number | ||||||
|                         state |                         state | ||||||
|                         author { |                         author { | ||||||
|  |                         login | ||||||
|                         url |                         url | ||||||
|                         } |                         } | ||||||
|                         labels(first:5) { |                         labels(first:5) { | ||||||
| @ -41,8 +46,10 @@ def get_discussion_gql(repo_owner, repo_name): | |||||||
|                             # edges.node is where the actual `Comment` object is |                             # edges.node is where the actual `Comment` object is | ||||||
|                             edges { |                             edges { | ||||||
|                             node { |                             node { | ||||||
|  |                                 id | ||||||
|                                 author { |                                 author { | ||||||
|                                 avatarUrl |                                     login | ||||||
|  |                                     url | ||||||
|                                 } |                                 } | ||||||
|                                 body |                                 body | ||||||
|                             } |                             } | ||||||
|  | |||||||
| @ -1,6 +1,8 @@ | |||||||
| import datetime as dt | import datetime as dt | ||||||
| from perceval.backends.core.git import Git | from perceval.backends.core.git import Git | ||||||
| import argparse | import argparse | ||||||
|  | from wrapt_timeout_decorator import * | ||||||
|  | import json | ||||||
| 
 | 
 | ||||||
| #globals | #globals | ||||||
| #repo_dir = '/tmp/' | #repo_dir = '/tmp/' | ||||||
| @ -12,6 +14,8 @@ def main(vcs_path, begin_date): | |||||||
|     if len(perceval_info['list_of_commits']) > 0: |     if len(perceval_info['list_of_commits']) > 0: | ||||||
|         perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits']) |         perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits']) | ||||||
|         perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits']) |         perceval_info['contributors'], perceval_info['collaborators'], perceval_info['contributors_list'], perceval_info['collaborators_list'] = get_all_actors(perceval_info['list_of_commits']) | ||||||
|  |         with open("/data/users/mgaughan/kkex_commit_data_121223/" + "commits_" + vcs_path.split('/')[-1] + '.json', 'w') as commits_path: | ||||||
|  |             json.dump(perceval_info, commits_path) | ||||||
|         del perceval_info['list_of_commits'] |         del perceval_info['list_of_commits'] | ||||||
|         print(perceval_info) |         print(perceval_info) | ||||||
|         return perceval_info |         return perceval_info | ||||||
| @ -22,6 +26,7 @@ def main(vcs_path, begin_date): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # this is the primary function for getting the list of commits from perceval | # this is the primary function for getting the list of commits from perceval | ||||||
|  | @timeout(600, use_signals=False) | ||||||
| def get_perceval_log(vcs_path, begin_date): | def get_perceval_log(vcs_path, begin_date): | ||||||
|     vcs_path = vcs_path.strip() |     vcs_path = vcs_path.strip() | ||||||
|     print(vcs_path) |     print(vcs_path) | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user