final prep for expanded data collection a1
This commit is contained in:
		
							parent
							
								
									ab763bcc13
								
							
						
					
					
						commit
						49a02dfca4
					
				| @ -3,7 +3,7 @@ import requests | |||||||
| import os | import os | ||||||
| import datetime as dt | import datetime as dt | ||||||
| import wget | import wget | ||||||
| 
 | import json | ||||||
| import perceval_tasks as pt | import perceval_tasks as pt | ||||||
| import github_api_req as gha | import github_api_req as gha | ||||||
| import gh_gsql_req as ghs | import gh_gsql_req as ghs | ||||||
| @ -11,19 +11,24 @@ import debian_queries as dqs | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| key = os.environ.get('KKEXKEY') | key = os.environ.get('KKEXKEY') | ||||||
| test_csv_path = "120523_data_test.csv" | test_csv_path = "120523_expanded_data_test.csv" | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|     early_cutoff = dt.datetime(2008,2, 8) |     early_cutoff = dt.datetime(2008,2, 8) | ||||||
|  |     meta_dict = {} | ||||||
|  |     with open(test_csv_path, 'w', newline='') as output_file: | ||||||
|  |         keys = ["project_name", "underproduction_mean", "underproduction_low", "underproduction_high", "debian_vcs_link", "upstream_vcs_link", "age_of_project", "contributors", "collaborators", "milestone_count"] | ||||||
|  |         dict_writer = csv.DictWriter(output_file, keys) | ||||||
|  |         dict_writer.writeheader() | ||||||
|         with open('inst_all_packages_full_results.csv', newline='') as csvfile: |         with open('inst_all_packages_full_results.csv', newline='') as csvfile: | ||||||
|             array_of_projects =[] |             array_of_projects =[] | ||||||
|             spamreader = csv.reader(csvfile) |             spamreader = csv.reader(csvfile) | ||||||
|             index = 0 |             index = 0 | ||||||
|             successful_count = 0 |             successful_count = 0 | ||||||
|  |             no_upstream = 0 | ||||||
|             for row in spamreader: |             for row in spamreader: | ||||||
|                 index += 1 |                 index += 1 | ||||||
|             if index > 20: |                 if index > 10: | ||||||
|                     break |                     break | ||||||
|                 project_dict = {} |                 project_dict = {} | ||||||
|                 project_dict["project_name"] = row[0] |                 project_dict["project_name"] = row[0] | ||||||
| @ -38,20 +43,28 @@ def main(): | |||||||
|                 else: |                 else: | ||||||
|                     project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"]) |                     project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"]) | ||||||
|                 if project_dict["upstream_vcs_link"] == "": |                 if project_dict["upstream_vcs_link"] == "": | ||||||
|  |                     no_upstream += 1 | ||||||
|                     continue |                     continue | ||||||
|                 perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) |                 perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) | ||||||
|                 if perceval_data == {}: |                 if perceval_data == {}: | ||||||
|                     continue |                     continue | ||||||
|             project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'] |                 project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'] | ||||||
|  |                 if "github" in project_dict["upstream_vcs_link"]: | ||||||
|  |                     project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) | ||||||
|  |                     with open('/data/users/mgaughan/kkex_comment_data_120523/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: | ||||||
|  |                         json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) | ||||||
|  |                 else: | ||||||
|  |                     project_dict['milestone_count'] = 0 | ||||||
|                 successful_count += 1 |                 successful_count += 1 | ||||||
|                 if index > 1: |                 if index > 1: | ||||||
|                 array_of_projects.append(project_dict) |                     dict_writer.writerow(project_dict) | ||||||
|         print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) |     meta_dict['success_rate'] = successful_count/index | ||||||
|         keys = array_of_projects[0].keys() |     meta_dict['final_index'] = index | ||||||
|     with open(test_csv_path, 'w', newline='') as output_file: |     meta_dict['total_success'] = successful_count | ||||||
|         dict_writer = csv.DictWriter(output_file, keys) |     meta_dict['no_upstream_info'] = no_upstream | ||||||
|         dict_writer.writeheader() |     #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) | ||||||
|         dict_writer.writerows(array_of_projects) |     with open('120523_metadata_expanded.json', 'w') as data_path: | ||||||
|  |         json.dump(meta_dict, data_path) | ||||||
|              |              | ||||||
| 
 | 
 | ||||||
| def clean_gh_vcs_link(debian_vcs_link): | def clean_gh_vcs_link(debian_vcs_link): | ||||||
|  | |||||||
| @ -7,12 +7,12 @@ key = os.environ.get('KKEXKEY') | |||||||
| 
 | 
 | ||||||
| def main(vcs, early_cutoff): | def main(vcs, early_cutoff): | ||||||
|     gsql_dict = {} |     gsql_dict = {} | ||||||
|     vcs_list = vcs[0].split('/') |     vcs_list = vcs.split('/') | ||||||
|     repo_name = '"' + vcs_list[-1] + '"' |     repo_name = '"' + vcs_list[-1] + '"' | ||||||
|     repo_owner = '"' + vcs_list[-2] + '"' |     repo_owner = '"' + vcs_list[-2] + '"' | ||||||
|     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name) |     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name) | ||||||
|     gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff) |     gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff) | ||||||
|     return gsql_dict |     return gsql_dict['original_returned_content'] | ||||||
| 
 | 
 | ||||||
| def get_discussion_gql(repo_owner, repo_name): | def get_discussion_gql(repo_owner, repo_name): | ||||||
|     url = "https://api.github.com/graphql" |     url = "https://api.github.com/graphql" | ||||||
|  | |||||||
| @ -6,14 +6,14 @@ import os | |||||||
| key = os.environ.get('KKEXKEY') | key = os.environ.get('KKEXKEY') | ||||||
| 
 | 
 | ||||||
| def main(vcs, begin_date): | def main(vcs, begin_date): | ||||||
|     repo_uri=vcs[0] |     repo_uri=vcs | ||||||
|     gha_info = {} |     gha_info = {} | ||||||
|     #this is the entire list of Github 'milestones' grabbed from the API |     #this is the entire list of Github 'milestones' grabbed from the API | ||||||
|     gha_info['milestones'] = get_milestone_information(repo_uri) |     gha_info['milestones'] = get_milestone_information(repo_uri) | ||||||
|     #this is the count of milestones that occur after the cutoff date |     #this is the count of milestones that occur after the cutoff date | ||||||
|     gha_info['milestone_count'] = parse_milestones(gha_info['milestones'], begin_date) |     gha_info['milestone_count'] = parse_milestones(gha_info['milestones'], begin_date) | ||||||
|     #split_actors(repo_uri, actors_list) |     #split_actors(repo_uri, actors_list) | ||||||
|     return gha_info |     return gha_info['milestone_count'] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| #this simple API call has been working for now but may need to be updated as more information is desired | #this simple API call has been working for now but may need to be updated as more information is desired | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user