# 24_deb_pkg_gov/pr_data_get.py
#
# 119 lines
# 4.9 KiB
# Python

import csv
import datetime as dt
import math
import os
import shutil
import time

import dateutil
import dateutil.parser
import pandas as pd
from perceval.backends.core.git import Git
from tqdm import tqdm
# Value of the KKEXKEY environment variable (None when it is unset).
key = os.getenv('KKEXKEY')
# Fixed reference date: 8 February 2008, midnight, timezone-naive.
early_cutoff = dt.datetime(year=2008, month=2, day=8)
# Location prefix under which file_get_pr places its temporary clone.
# NOTE: the string carries no trailing path separator.
temp_dir = "/data/users/mgaughan/tmp"
'''
- rate of change, rate of PRs/day
'''
def file_get_pr(upstream_vcs_link, me_read):
    """Count weekly commits and merges around a project's first README/CONTRIBUTING commit.

    The repository is fetched through Perceval into a temporary path, every
    commit is recorded as ``[CommitDate, is_merge]``, and the first commit
    touching a path containing "README" / "CONTRIBUTING" is located.
    ``pr_count`` is then run on the ``window`` days before and after that
    commit.

    Args:
        upstream_vcs_link: Repository URL. The fifth '/'-separated piece
            (index 4) is used to name the temporary clone.
        me_read: True to analyse the first README commit, False to analyse
            the first CONTRIBUTING commit.

    Returns:
        A dict holding 'upstream_vcs_link', the event date
        ('first_readme' or 'first_contributing') and the before/after
        results of ``pr_count`` under the 'b6w_*' / 'a6w_*' keys, or None
        when fetching fails or no matching file is ever seen.
    """
    # Days examined on either side of the event.
    window = 182
    project_dict = {'upstream_vcs_link': upstream_vcs_link}
    # os.path.join supplies the separator that plain string concatenation
    # with temp_dir (no trailing slash) leaves out.
    full_temp_path = os.path.join(temp_dir, upstream_vcs_link.split('/')[4] + ".git")
    repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)

    first_date_readme = None
    first_date_contributing = None
    # [CommitDate string, is_merge] pairs, in the order Perceval yields them.
    commit_list = []
    try:
        # NOTE(review): Perceval's fetch() is understood to return a lazy
        # generator, so clone/parse errors surface while iterating rather
        # than at the call itself; hence the loop lives inside the try.
        for commit in repo.fetch():
            data = commit['data']
            commit_list.append([data['CommitDate'], "Merge" in data])
            for changed in data['files']:
                if first_date_contributing is None and "CONTRIBUTING" in changed['file']:
                    first_date_contributing = dateutil.parser.parse(data['CommitDate'])
                if first_date_readme is None and "README" in changed['file']:
                    first_date_readme = dateutil.parser.parse(data['CommitDate'])
    except Exception:
        # Best effort: one unreadable repository must not abort a batch run.
        print("perceval issue")
        return None
    finally:
        # Remove the temporary clone on success and on failure alike.
        shutil.rmtree(full_temp_path, ignore_errors=True)

    if me_read:
        event_date, date_key, suffix = first_date_readme, 'first_readme', 'read'
    else:
        event_date, date_key, suffix = first_date_contributing, 'first_contributing', 'cont'
    if event_date is None:
        # Nothing to centre the window on; previously an UnboundLocalError.
        print("no matching file found")
        return None

    project_dict[date_key] = event_date
    before = pr_count(event_date - dt.timedelta(days=window), event_date, commit_list)
    project_dict['b6w_prs_' + suffix] = before[0]
    project_dict['b6w_mrg_' + suffix] = before[1]
    after = pr_count(event_date, event_date + dt.timedelta(days=window), commit_list)
    project_dict['a6w_prs_' + suffix] = after[0]
    project_dict['a6w_mrg_' + suffix] = after[1]
    print(project_dict)
    return project_dict
#TODO: pr_count should return an array of values for weekly/6mo
def pr_count(start, end, commits):
    """Bucket commits, and separately merge commits, by week inside (start, end].

    Args:
        start: datetime opening the window (exclusive); week 0 begins here.
        end: datetime closing the window (inclusive).
        commits: iterable of ``[date_string, is_merge]`` entries. Each
            date string is parsed with ``dateutil.parser.parse`` and must
            yield a datetime comparable with ``start``/``end`` (Python
            refuses to compare naive with timezone-aware values).

    Returns:
        ``[by_week, by_week_merge]``: two lists of 27 ints. Index ``i``
        counts the commits (respectively merge commits) whose whole-day
        offset from ``start``, floor-divided by 7, equals ``i``. A result
        is always returned, even when no commit falls after ``end``.
    """
    weeks = 27
    by_week = [0] * weeks
    by_week_merge = [0] * weeks
    for commit in tqdm(commits):
        # Parse once per commit instead of once per comparison.
        committed = dateutil.parser.parse(commit[0])
        # Skip rather than stop: nothing here guarantees that commits
        # arrive sorted by date, and commits outside the window must not
        # be counted.
        if committed <= start or committed > end:
            continue
        week = (committed - start).days // 7
        if week >= weeks:
            # Only reachable when a caller passes a window longer than 27 weeks.
            continue
        by_week[week] += 1
        if commit[1]:
            by_week_merge[week] += 1
    print(len(by_week))
    return [by_week, by_week_merge]
#TODO: need to do this for all files in the dataset of readme or contributing
def for_files():
    """Run file_get_pr in README mode for every roster row and write the results to CSV.

    Reads 'final_data/kk_final_readme_roster.csv' (which must have an
    'upstream_vcs_link' column) and writes one row per successfully
    processed project to 'kk_test_031024_pr_did.csv'. Projects for which
    file_get_pr returns None are skipped.
    """
    csv_path = "final_data/kk_final_readme_roster.csv"
    keys = ['upstream_vcs_link', "first_readme", "b6w_prs_read", "b6w_mrg_read", "a6w_prs_read", "a6w_mrg_read"]
    # newline='' is what the csv module asks for on files it reads or writes.
    with open(csv_path, 'r', newline='') as file:
        csv_reader = csv.DictReader(file)
        with open('kk_test_031024_pr_did.csv', "w", newline='') as writing_file:
            dict_writer = csv.DictWriter(writing_file, keys)
            dict_writer.writeheader()
            for row in csv_reader:
                print(row['upstream_vcs_link'])
                dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True)
                if dict_row is None:
                    # file_get_pr reports failure by returning None, and
                    # DictWriter.writerow cannot take None.
                    continue
                dict_writer.writerow(dict_row)
if __name__ == "__main__":
    # Script entry point: process the whole README roster.
    for_files()
    # Single-repository calls kept for manual debugging. The second one
    # omits the me_read argument that file_get_pr requires.
    #file_get_pr("https://github.com/tqdm/tqdm", False)
    #file_get_pr("https://github.com/GameServerManagers/LinuxGSM")