updates to main functions, data collection
This commit is contained in:
		
							parent
							
								
									fd10bdfa33
								
							
						
					
					
						commit
						d794f1b50d
					
				| @ -11,7 +11,7 @@ def main(vcs, early_cutoff): | |||||||
|     repo_name = '"' + vcs_list[-1] + '"' |     repo_name = '"' + vcs_list[-1] + '"' | ||||||
|     repo_owner = '"' + vcs_list[-2] + '"' |     repo_owner = '"' + vcs_list[-2] + '"' | ||||||
|     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name) |     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name) | ||||||
|     gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff) |     gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff) | ||||||
|     return gsql_dict |     return gsql_dict | ||||||
| 
 | 
 | ||||||
| def get_discussion_gql(repo_owner, repo_name): | def get_discussion_gql(repo_owner, repo_name): | ||||||
| @ -61,17 +61,21 @@ def get_discussion_gql(repo_owner, repo_name): | |||||||
|     headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key} |     headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key} | ||||||
|     r = requests.post(url=url, data=data_json, headers=headers) |     r = requests.post(url=url, data=data_json, headers=headers) | ||||||
|     #print(r.content) |     #print(r.content) | ||||||
|     return r |     return r.json() | ||||||
| 
 | 
 | ||||||
def within_time(comment_content, early_cutoff):
    """Return the discussion edges created on or after ``early_cutoff``.

    comment_content: the GraphQL response, either still serialized
        (str / bytes / bytearray) or already decoded into Python objects.
    early_cutoff: datetime compared against each edge's ``createdAt``.

    Edges are walked in the order returned and the walk stops at the first
    edge older than the cutoff. When the expected nesting cannot be indexed
    (a TypeError, e.g. a None where a dict is expected) an empty list is
    returned after printing a notice.
    """
    # Only raw payloads need decoding. Passing an already-decoded response to
    # json.loads raises TypeError, which the handler below would report as
    # "no discussions found" for every repository.
    if isinstance(comment_content, (str, bytes, bytearray)):
        comment_content = json.loads(comment_content)
    try:
        list_of_comments = comment_content["data"]["repository"]["discussions"]["edges"]
        valid_comments = []
        for comment in list_of_comments:
            # [:-1] drops the last character of the timestamp before parsing
            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
                break
            valid_comments.append(comment)
        return valid_comments
    except TypeError:
        print("no discussions found")
        return []
|      |      | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|  | |||||||
| @ -21,7 +21,8 @@ def get_milestone_information(repo_uri): | |||||||
|     repo_uri_list = repo_uri.split('/') |     repo_uri_list = repo_uri.split('/') | ||||||
|     print(repo_uri_list) |     print(repo_uri_list) | ||||||
|     api_url = "https://api.github.com/repos/" + repo_uri_list[-2]  + "/" + repo_uri_list[-1] + "/milestones" |     api_url = "https://api.github.com/repos/" + repo_uri_list[-2]  + "/" + repo_uri_list[-1] + "/milestones" | ||||||
|     response = requests.get(api_url) |     headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key} | ||||||
|  |     response = requests.get(url = api_url, headers=headers) | ||||||
|     response_dict = response.json() |     response_dict = response.json() | ||||||
|     return response_dict |     return response_dict | ||||||
| 
 | 
 | ||||||
| @ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date): | |||||||
|         # TODO: decide whether to use created_at or updated_at or closed_at |         # TODO: decide whether to use created_at or updated_at or closed_at | ||||||
|         # problem is that no one closes their milestones?! hardly seems representative?! |         # problem is that no one closes their milestones?! hardly seems representative?! | ||||||
|         # making this a note here, as both Tamburri and van Meijel use 'closed_at' |         # making this a note here, as both Tamburri and van Meijel use 'closed_at' | ||||||
|         if entry['updated_at'] != None: |         print(entry) | ||||||
|             if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date: |         try: | ||||||
|                 count_of_milestones += 1 |             if entry['updated_at'] != None: | ||||||
|  |                 if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date: | ||||||
|  |                     count_of_milestones += 1 | ||||||
|  |         except TypeError: | ||||||
|  |             print("string indices error? or I think maybe they just don't use milestones") | ||||||
|     return count_of_milestones |     return count_of_milestones | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										37
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										37
									
								
								main.py
									
									
									
									
									
								
							| @ -2,18 +2,20 @@ import perceval | |||||||
| import os | import os | ||||||
| import yaml | import yaml | ||||||
| import datetime as dt | import datetime as dt | ||||||
| 
 | import json | ||||||
|  | #functions from other files | ||||||
| import perceval_tasks as pt | import perceval_tasks as pt | ||||||
| import github_api_req as gha | import github_api_req as gha | ||||||
| import gh_gsql_req as ghs | import gh_gsql_req as ghs | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|     # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window |     # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window | ||||||
|     early_cutoff = dt.datetime(2023,10, 11) |     early_cutoff = dt.datetime(2013,11, 6) | ||||||
|     print("Earliest date examined: " + str(early_cutoff)) |     print("Earliest date examined: " + str(early_cutoff)) | ||||||
|     largest_object = {} |     largest_object = {} | ||||||
|     #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml' |     #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml' | ||||||
|     directory='../kaylea_dissertation/lifecycle/package_metadata/' |     directory='../kaylea_dissertation/lifecycle/package_metadata/' | ||||||
|  |     count_of_dir = 0 | ||||||
|     for filename in os.listdir(directory): |     for filename in os.listdir(directory): | ||||||
|         f = os.path.join(directory, filename) |         f = os.path.join(directory, filename) | ||||||
|         # checking if it is a file |         # checking if it is a file | ||||||
| @ -21,8 +23,15 @@ def main(): | |||||||
|             print(f)    |             print(f)    | ||||||
|         get_everything(f, largest_object, early_cutoff) |         get_everything(f, largest_object, early_cutoff) | ||||||
|         #remove this and it should just run? for the most part at least I think |         #remove this and it should just run? for the most part at least I think | ||||||
|         break |         count_of_dir += 1 | ||||||
|  |         if count_of_dir > 2: | ||||||
|  |             break | ||||||
|     print(largest_object.keys()) |     print(largest_object.keys()) | ||||||
|  |     print(len(largest_object.keys())) | ||||||
|  |     for repo in largest_object: | ||||||
|  |         print(largest_object[repo]['new_formality']) | ||||||
|  |     with open('result.json', 'w') as results_path: | ||||||
|  |         json.dump(largest_object, results_path) | ||||||
| 
 | 
 | ||||||
| def get_everything(manifest_path, largest_object, early_cutoff): | def get_everything(manifest_path, largest_object, early_cutoff): | ||||||
|     with open(manifest_path, 'r') as stream: |     with open(manifest_path, 'r') as stream: | ||||||
| @ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff): | |||||||
|             config = yaml.safe_load(stream) |             config = yaml.safe_load(stream) | ||||||
|             #below lines will probably need to be refactored as tasks expand |             #below lines will probably need to be refactored as tasks expand | ||||||
|             vcs_path = config['Upstream_VCS'] |             vcs_path = config['Upstream_VCS'] | ||||||
|             #print("------------------") |             print("------------------") | ||||||
|             #print(vcs_path) |             #print(vcs_path) | ||||||
|             repo_path = vcs_path[0] |             repo_path = vcs_path[0] | ||||||
|             largest_object[repo_path] = {} |             largest_object[repo_path] = {} | ||||||
|             largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff) |             largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff) | ||||||
|  |             if len(largest_object[repo_path]["perceval_obj"]) == 0: | ||||||
|  |                 print("PERCEVAL ERROR") | ||||||
|  |                 del largest_object[repo_path] | ||||||
|  |                 return | ||||||
|             largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff) |             largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff) | ||||||
|             #these are the two variables in the denominator of the formality measure |             ''' | ||||||
|             #print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project'])) |             if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0: | ||||||
|             #print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors']))) |                 #del largest_object[repo_path] | ||||||
|             #print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators']))) |                 #return | ||||||
|             #print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count'])) |                 #this is to ensure that projects which don't use milestones are counted | ||||||
|             largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators'])) |                 largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1 | ||||||
|  |             largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators']) | ||||||
|             #print('New MMT: ' + str(largest_object[repo_path]['new_mmt'])) |             #print('New MMT: ' + str(largest_object[repo_path]['new_mmt'])) | ||||||
|             largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators'])) |             largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators']) | ||||||
|             #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt'])) |             #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt'])) | ||||||
|             #new mmt formality score |             #new mmt formality score | ||||||
|             largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project']) |             largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project']) | ||||||
|             print(largest_object[repo_path]['new_formality']) |             print(largest_object[repo_path]['new_formality']) | ||||||
|  |             ''' | ||||||
|             # testing out beneath: |             # testing out beneath: | ||||||
|             largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff) |             largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff) | ||||||
|             #print(ghs_obj["time_cleaned_comm"]) |             #print(ghs_obj["time_cleaned_comm"]) | ||||||
|  | |||||||
| @ -3,26 +3,43 @@ from perceval.backends.core.git import Git | |||||||
| import argparse | import argparse | ||||||
| 
 | 
 | ||||||
| #globals | #globals | ||||||
| repo_dir = '/tmp/' | #repo_dir = '/tmp/' | ||||||
| 
 | 
 | ||||||
| #main function for all subsequent tasks using perceval  | #main function for all subsequent tasks using perceval  | ||||||
def main(vcs_path, begin_date):
    """Collect the perceval-derived measures for one repository.

    Fetches the commit log via get_perceval_log and, when it is non-empty,
    returns a dict with 'age_of_project', 'contributors' and 'collaborators'.
    The commit list itself is not part of the result. An empty log yields an
    empty dict after printing a notice.
    """
    commits = get_perceval_log(vcs_path, begin_date)
    if not len(commits) > 0:
        print('error, no commits found?')
        return {}

    perceval_info = {}
    perceval_info['age_of_project'] = get_repo_age(commits)
    contributors, collaborators = get_all_actors(commits)
    perceval_info['contributors'] = contributors
    perceval_info['collaborators'] = collaborators
    return perceval_info
|  | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # this is the primary function for getting the list of commits from perceval | # this is the primary function for getting the list of commits from perceval | ||||||
def get_perceval_log(vcs_path, begin_date, clone_root='/Users/mgone/Desktop/tmp/'):
    """Fetch a repository's commits through perceval's Git backend.

    vcs_path: sequence whose first element is the repository URI.
    begin_date: forwarded to fetch() as ``from_date``.
    clone_root: prefix under which the local clone is placed; the last path
        segment of the URI is appended to it. Defaults to the location that
        was previously hard-coded.

    Returns the fetched commits as a list, or an empty list when the fetch
    fails, so callers always receive the same type and can test its length.
    """
    print(vcs_path)
    repo_dir = clone_root + str(vcs_path[0].split('/')[-1])
    try:
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        # this is a temporary date_from, will need to be more inclusive in the future
        fetched_commits = repo.fetch(from_date=begin_date)
        return list(fetched_commits)
    except Exception:
        # Exception rather than a bare except: Ctrl-C / SystemExit during a
        # long clone must still be able to stop the run.
        print("error, cannot fetch repo data?")
        return []
| 
 | 
 | ||||||
| #this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel | #this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel | ||||||
| def get_repo_age(all_commits): | def get_repo_age(all_commits): | ||||||
| @ -34,6 +51,7 @@ def get_repo_age(all_commits): | |||||||
|     project_life = last_date - first_date |     project_life = last_date - first_date | ||||||
|     return project_life.total_seconds() / 86400 |     return project_life.total_seconds() / 86400 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| #attempt at getting the rosters, though need to make sure that we can get the MR | #attempt at getting the rosters, though need to make sure that we can get the MR | ||||||
| def get_all_actors(all_commits): | def get_all_actors(all_commits): | ||||||
|     #collaborators are more senior than contributors, doing it by author/commit |     #collaborators are more senior than contributors, doing it by author/commit | ||||||
| @ -50,5 +68,5 @@ def get_all_actors(all_commits): | |||||||
|     for committer in committers: |     for committer in committers: | ||||||
|         if committer in authors: |         if committer in authors: | ||||||
|             authors.remove(committer) |             authors.remove(committer) | ||||||
|     return authors, committers |     return len(authors), len(committers) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user