# 24_deb_pkg_gov/pr_data_get.py
import csv
import datetime as dt
import math
import os
import shutil

import dateutil.parser
from perceval.backends.core.git import Git
from tqdm import tqdm

key = os.environ.get('KKEXKEY')
early_cutoff = dt.datetime(2008, 2, 8)
# local scratch directory where Perceval clones each repository; the trailing
# slash matters because paths below are built by string concatenation
temp_dir = "/data/users/mgaughan/tmp/"
'''
- rate of change, rate of PRs/day
'''
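# Overall flow: clone each upstream repository with Perceval, find the first
# commit that touches a README (or CONTRIBUTING) file, then count weekly
# commit and merge activity in the 182-day windows before and after that commit.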
def file_get_pr(upstream_vcs_link, me_read):
    # me_read is True when we're looking at READMEs and False when we're
    # looking at CONTRIBUTING files
    # this is the window of days on either side of the event that we're looking at
    window = 182
    project_dict = {}
    project_dict['upstream_vcs_link'] = upstream_vcs_link
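    # this one repository is skipped outright; the comparison runs before
    # strip(), so the literal keeps its leading space to match the raw CSV value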
if upstream_vcs_link == " https://gitlab.com/ubports/core/cmake-extras":
return {}
upstream_vcs_link = upstream_vcs_link.strip()
if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
#making an evaluation that sub branches aren't being used and that people would fork if needed
#this only looks at main
upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5])
print(upstream_vcs_link)
full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git"
else:
full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git"
print(upstream_vcs_link)
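    # Perceval's Git backend clones the repository into gitpath; iterating
    # fetch() then yields one item per commit in the repository's history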
repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
    try:
        commits = repo.fetch()
    except Exception:
        print("perceval issue")
        # return an empty dict so csv.DictWriter downstream never sees None
        return {}
    has_readme = False
    has_contributing = False
    merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0
    # list of [commit date, is_merge, author, committer] entries, one per commit
    commit_list = []
    first_date_readme = ""
    first_date_contributing = ""
    for commit in commits:
        # Perceval only includes a 'Merge' header for merge commits, which we
        # use as a proxy for merged pull requests
        if "Merge" in commit['data']:
            commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']])
            if has_contributing:
                merge_post_cont += 1
            else:
                merge_pre_cont += 1
        else:
            commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']])
            # only non-merge commits are scanned for governance documents
            files = commit['data']['files']
            for file in files:
                if "CONTRIBUTING" in file['file'] and not has_contributing:
                    has_contributing = True
                    first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate'])
                if "README" in file['file'] and not has_readme:
                    has_readme = True
                    first_date_readme = dateutil.parser.parse(commit['data']['CommitDate'])
                    # Perceval stores the commit hash under the lowercase 'commit' key
                    project_dict['readme_commit_hash'] = commit['data']['commit']
    # remove the local clone now that the history has been walked
    shutil.rmtree(full_temp_path, ignore_errors=True)
    # bail out if the event of interest never appeared in the history
    if me_read and first_date_readme == "":
        return {}
    if not me_read and first_date_contributing == "":
        return {}
    if me_read:
        project_dict['first_readme'] = first_date_readme
        before_read = pr_count(first_date_readme - dt.timedelta(days=window), first_date_readme, commit_list, [], [])
        if before_read is not None:
            project_dict['before_prs_read'] = before_read[0]
            project_dict['before_mrg_read'] = before_read[1]
            project_dict['before_auth_new'] = before_read[2]
            project_dict['before_commit_new'] = before_read[3]
        else:
            return {}
        after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window), commit_list, before_read[4], before_read[5])
        if after_read is not None:
            project_dict['after_prs_read'] = after_read[0]
            project_dict['after_mrg_read'] = after_read[1]
            project_dict['after_auth_new'] = after_read[2]
            project_dict['after_commit_new'] = after_read[3]
        else:
            return {}
    else:
        project_dict['first_contributing'] = first_date_contributing
        before_cont = pr_count(first_date_contributing - dt.timedelta(days=window), first_date_contributing, commit_list, [], [])
        if before_cont is not None:
            project_dict['before_prs_cont'] = before_cont[0]
            project_dict['before_mrg_cont'] = before_cont[1]
            project_dict['before_auth_new'] = before_cont[2]
            project_dict['before_commit_new'] = before_cont[3]
        else:
            return {}
        after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window), commit_list, before_cont[4], before_cont[5])
        if after_cont is not None:
            project_dict['after_prs_cont'] = after_cont[0]
            project_dict['after_mrg_cont'] = after_cont[1]
            project_dict['after_auth_new'] = after_cont[2]
            project_dict['after_commit_new'] = after_cont[3]
        else:
            return {}
    print(project_dict)
    return project_dict
#TODO: pr_count should return an array of values for weekly/6mo
def pr_count(start, end, commits, author_roster, commit_roster):
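    """
    Count commits and merges per week in the window (start, end], plus how many
    authors/committers in that window are new relative to the rosters passed in.

    Each entry of `commits` is [date string, is_merge, author, committer].
    Returns [by_week, by_week_merge, new_authors, new_committers,
    author_roster, commit_roster], or None if no commit falls after `end`.
    """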
    # 27 weekly buckets cover the 182-day window (26 full weeks plus a remainder)
    by_week = [0] * 27
    by_week_merge = [0] * 27
    new_authors = 0
    new_committers = 0
    for commit in tqdm(commits):
        commit_date = dateutil.parser.parse(commit[0])
        if commit_date <= start:
            # commits at or before the window start only seed the rosters
            if commit[2] not in author_roster:
                author_roster.append(commit[2])
            if commit[1] and commit[3] not in commit_roster:
                commit_roster.append(commit[3])
        if commit_date > start:
            week = math.floor((commit_date - start).days / 7)
            if week <= 26:
                by_week[week] += 1
                if commit[1]:
                    by_week_merge[week] += 1
                    if commit[3] not in commit_roster:
                        new_committers += 1
                        # remaining question of whether to track the author of the
                        # merge (commit[2]) or the committer of the merge (commit[3])
                        commit_roster.append(commit[3])
            if commit[2] not in author_roster:
                new_authors += 1
                author_roster.append(commit[2])
        if commit_date > end:
            # commits are assumed to arrive in date order, so the first one
            # past `end` closes the window
            return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
    # falling through the loop means the window extends past the newest commit;
    # the function then implicitly returns None and callers drop the project
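# A minimal sketch of calling pr_count on hand-made data (the commit tuples
# below are hypothetical, not drawn from any real repository):
#
#   start = dateutil.parser.parse("2020-01-01T00:00:00+00:00")
#   fake_commits = [
#       ["2020-01-03T12:00:00+00:00", False, "Alice <a@example.com>", "Alice <a@example.com>"],
#       ["2020-02-10T12:00:00+00:00", True, "Bob <b@example.com>", "Carol <c@example.com>"],
#       ["2021-01-05T12:00:00+00:00", False, "Dan <d@example.com>", "Dan <d@example.com>"],
#   ]
#   result = pr_count(start, start + dt.timedelta(days=182), fake_commits, [], [])
#   result[0]  # 27-entry weekly commit histogram; result[1] is the merge histogram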
def for_files():
    csv_path = "final_data/kk_final_readme_roster.csv"
    count = 0
    with open(csv_path, 'r') as file:
        csv_reader = csv.DictReader(file)
        with open('kk_test_031424_pr_did.csv', "w") as writing_file:
            # this key list would also have to get switched for the cont dataset
            keys = ['upstream_vcs_link', "first_readme", "before_prs_read", "before_mrg_read", "after_prs_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
            dict_writer = csv.DictWriter(writing_file, keys)
            dict_writer.writeheader()
            for row in csv_reader:
                count += 1
                print(row['upstream_vcs_link'])
                # me_read would have to be switched to False for the cont dataset
                dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True)
                dict_writer.writerow(dict_row)
if __name__ == "__main__":
#for_files()
file_get_pr("https://github.com/tqdm/tqdm", True)
#file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
#file_get_pr("https://github.com/walling/unorm/issues/new/", True)
file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)