backing up with revision

Matthew Gaughan 2025-01-28 23:04:51 -06:00
parent d6e671c7a5
commit 2d9ce17e3a
3 changed files with 515 additions and 0 deletions


@@ -0,0 +1,116 @@
import csv
from git import Repo
from perceval.backends.core.git import Git
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
import io
import re
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2"
temp_dir = "/data/users/mgaughan/tmp3/"
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    if "github" in vcs_link or "gitlab" in vcs_link:
        #making an evaluation that sub branches aren't being used and that people would fork if needed
        #this only looks at main
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
        other_temp_path = temp_dir + vcs_link.split('/')[4] + ".git0"
    else:
        full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
        other_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git0"
    # two clones of the same repo: GitPython for tree/blob access, Perceval for commit iteration
    repo0 = Repo.clone_from(vcs_link, full_temp_path)
    repo = Git(uri=vcs_link, gitpath=other_temp_path)
    commit0 = repo0.commit(commit_hash)
    commits = repo.fetch()
    target_filename = ""
    for commit in commits:
        files = commit['data']['files']
        for file in files:
            if is_readme:
                if "README" in file['file']:
                    #print(file['file'])
                    if "/" in file['file']:
                        target_filename = file['file'].split("/")[-1]
                    else:
                        target_filename = file['file']
            else:
                if "CONTRIBUTING" in file['file']:
                    if "/" in file['file']:
                        target_filename = file['file'].split("/")[-1]
                    else:
                        target_filename = file['file']
    if target_filename == "":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)
        return "NoFile"
    targetfile = ""
    # look for the relevant file among the top-level blobs of the commit's tree
    target_pattern = "README" if is_readme else "CONTRIBUTING"
    for blob in commit0.tree.blobs:
        #print(type(blob.path))
        if target_pattern in blob.path:
            targetfile = blob
            #print(blob.path)
            # why would a file not be in the commit tree? but would be in the directory?
            #shutil.rmtree(full_temp_path, ignore_errors=True)
            # return "KeyError -- the file is not in the commit tree"
    if targetfile == "":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)
        return "KeyError -- the file is not in the commit tree"
    if is_readme:
        last_path = "readme2"
    else:
        last_path = "contributing2"
    with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path, "w") as file:
        with io.BytesIO(targetfile.data_stream.read()) as f:
            file.write(f.read().decode('utf-8', errors='ignore'))
            #file.write(f.read())
    shutil.rmtree(full_temp_path, ignore_errors=True)
    shutil.rmtree(other_temp_path, ignore_errors=True)
    return "NoError"

def for_all_files():
    #toggle this based on readme or contributing files
    readme_is = False
    csv_path = "final_data/deb_contrib_did_data.csv"
    index = -1
    saved = []
    with open(csv_path, 'r') as file:
        with open('e_031824_spec_errors.csv', "w") as writing_file:
            csv_writer = csv.writer(writing_file)
            with open("contributing_completed_downloads.csv", "w") as writing_file2:
                csv_writer2 = csv.writer(writing_file2)
                #csv_reader = csv.DictReader(file)
                lines = [line for line in file]
                for row in tqdm(csv.reader(lines), total=len(lines)):
                    index += 1
                    if index == 0:
                        continue
                    if row[0] == "":
                        continue
                    #print(row[0])
                    return_value = get_file(row[0], row[2], readme_is)
                    if return_value != "NoError":
                        csv_writer.writerow([row[0], row[2], readme_is, return_value])
                    else:
                        # if it is NoError, write the row to the csv of completed downloads;
                        # there's an issue of duplicates, so only record each vcs link once
                        if row[0] in saved:
                            continue
                        saved.append(row[0])
                        csv_writer2.writerow(row)

if __name__ == "__main__":
    for_all_files()


@@ -0,0 +1,224 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
temp_dir = "/data/users/mgaughan/tmp3/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"

def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : url link to upstream repo vcs
        temp_location : filepath to where the repo should be cloned to
    RETURNS
        repo : the GitRepository object of the cloned repo
        repo_path : the filepath to the cloned repository
    """
    #print(temp_location)
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path

def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository
    RETURNS
        whether or not the deletion was a success (0 on success, 1 if there was nothing to delete)
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1
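
# Illustrative pairing of the two helpers above (hypothetical URL): temp_clone() makes the
# working copy and delete_clone() is expected to clean it up afterwards, even on failure.
#   repo, repo_path = temp_clone("https://github.com/tqdm/tqdm", temp_dir)
#   ...run analysis on repo...
#   delete_clone(temp_dir)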

# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # skip commits newer than the analysis window; stop once we are past its lower bound
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information, e.g. "dev@lists.debian.org" -> "lists"
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info

def diff_analysis(diffs):
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects
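
# For reference, each element of the returned list is a dict shaped like the (made-up) example
# below; lines_added/lines_deleted are counted from the unified diff text rather than git stats.
#   {"lines_added": 3, "lines_deleted": 1, "parent_filepath": "README.md",
#    "child_filepath": "README.md", "change_type": "M", "new_file": False,
#    "deleted_file": False, "renamed_file": False}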

def for_all_files(start_index, stop_index):
    cwd = os.getcwd()
    csv_path = "../final_data/deb_full_data.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    with open(csv_path, 'r') as file:
        csv_reader = csv.DictReader(file)
        lines = [line for line in file]
        for row in tqdm(csv.reader(lines), total=len(lines)):
            index += 1
            #time.sleep(5)
            if index < start_index:
                continue
            if row[0] == "":
                empty_row += 1
                continue
            #row[5] = upstream vcs
            temp_repo_path = ""
            und_repo_id = ""
            try:
                os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                os.environ['GIT_ASKPASS'] = 'false'
                os.environ['GIT_TERMINAL_PROMPT'] = '0'
                ssh_url = ""
                try:
                    if "github" in row[5]:
                        repo_id = row[5][len('https://github.com/'):]
                        ssh_url = f'git@github.com:{repo_id}.git'
                        if ssh_url.endswith('.git.git'):
                            ssh_url = ssh_url[:-4]
                        temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    else:
                        parts = row[5].split('/')
                        domain = parts[2]
                        repo_id = '/'.join(parts[3:])
                        try:
                            temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                        except Exception as e:
                            print(f'non-GitHub cloning error, assuming HTTPS issue: {e}')
                            delete_clone(temp_dir)
                            ssh_url = f'git@{domain}:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                except Exception as e:
                    print(f'cloning error at {row[5]}')
                    print(f'inside cloning error: {e}')
                    raise ValueError(e)
                os.chdir(temp_repo_path)
                os.system("git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
                os.chdir(cwd)
                has_readme_bool, has_contributing_bool = False, False
                for filename in os.listdir(temp_repo_path):
                    if filename.startswith("README"):
                        has_readme_bool = True
                    if filename.startswith("CONTRIBUTING"):
                        has_contributing_bool = True
                if has_readme_bool or has_contributing_bool:
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = '_'.join(repo_id.split("/"))
                    if has_readme_bool:
                        has_readme += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
                    if has_contributing_bool:
                        has_contributing += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
            except Exception as e:
                clone_error.append([row[5], e])
                print(f"outside cloning error: {e}")
            finally:
                und_repo_id = ""
                delete_clone(temp_dir)
                os.chdir(cwd)
            if index == stop_index:
                break
    print(clone_error)
    with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
        for error in clone_error:
            # each entry is [vcs_link, exception], so format it before writing
            txt_file.write(f"{error}\n")
    with open(f"{stop_index}-success-output.txt", "w") as txt_file:
        txt_file.write(f"Number of Empty Rows: {empty_row} \n")
        txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
        txt_file.write(f"Number that has README: {has_readme} \n")
        txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
    print(f"Number of Empty Rows: {empty_row}")
    print(f"Number of Cloning Errors: {len(clone_error)}")
    print(f"Number that has README: {has_readme}")
    print(f"Number that has CONTRIBUTING: {has_contributing}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)
    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)
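
# Example invocation (the script name is whatever this file is saved as), processing rows 0-499:
#   python3 <this_script>.py --start_index 0 --stop_index 500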


@@ -0,0 +1,175 @@
import csv
from perceval.backends.core.git import Git
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
key = os.environ.get('KKEXKEY')
early_cutoff = dt.datetime(2008,2, 8)
temp_dir = "/data/users/mgaughan/tmp/"
'''
- rate of change, rate of all/day
'''

def file_get_pr(upstream_vcs_link, me_read):
    # if we're looking at readmes me_read is true; if we're looking at contributing files, it's false
    # this is the window of days on either side of the event that we're looking at
    window = 182
    #print(upstream_vcs_link.split('/')[4])
    project_dict = {}
    project_dict['upstream_vcs_link'] = upstream_vcs_link
    upstream_vcs_link = upstream_vcs_link.strip()
    if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
        #making an evaluation that sub branches aren't being used and that people would fork if needed
        #this only looks at main
        upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5])
        print(upstream_vcs_link)
        full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git"
    else:
        full_temp_path = temp_dir + upstream_vcs_link.split('/')[-1] + ".git"
        print(upstream_vcs_link)
    if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        return {}
    repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
    try:
        commits = repo.fetch()
    except:
        print("perceval issue")
        return
    has_readme = False
    has_contributing = False
    merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0
    #list of tuples which has date and whether it was a merge
    commit_list = []
    first_date_readme = ""
    first_date_contributing = ""
    for commit in commits:
        #print(commit['data'])
        if "Merge" in commit['data'].keys():
            commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']])
            if has_contributing:
                merge_post_cont += 1
            else:
                merge_pre_cont += 1
        else:
            commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']])
        files = commit['data']['files']
        #print(commit['data']['CommitDate'])
        #print(type(dateutil.parser.parse(commit['data']['CommitDate'])))
        for file in files:
            if "CONTRIBUTING" in file['file'] and has_contributing == False:
                has_contributing = True
                first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate'])
            if "README" in file['file'] and has_readme == False:
                has_readme = True
                first_date_readme = dateutil.parser.parse(commit['data']['CommitDate'])
                project_dict['readme_commit_hash'] = commit['data']['commit']
    shutil.rmtree(full_temp_path, ignore_errors=True)
    # bail out if the file we care about never shows up in the history
    if me_read and first_date_readme == "":
        return {}
    if not me_read and first_date_contributing == "":
        return {}
    if me_read:
        project_dict['first_readme'] = first_date_readme
        before_read = pr_count(first_date_readme + dt.timedelta(days=-window, hours=0), first_date_readme, commit_list, [], [])
        if before_read != None:
            project_dict['before_allcom_read'] = before_read[0]
            project_dict['before_mrg_read'] = before_read[1]
            project_dict['before_auth_new'] = before_read[2]
            project_dict['before_commit_new'] = before_read[3]
        else:
            return {}
        after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window, hours=0), commit_list, before_read[4], before_read[5])
        if after_read != None:
            project_dict['after_allcom_read'] = after_read[0]
            project_dict['after_mrg_read'] = after_read[1]
            project_dict['after_auth_new'] = after_read[2]
            project_dict['after_commit_new'] = after_read[3]
        else:
            return {}
    else:
        project_dict['first_contributing'] = first_date_contributing
        before_cont = pr_count(first_date_contributing + dt.timedelta(days=-window, hours=0), first_date_contributing, commit_list, [], [])
        if before_cont != None:
            project_dict['before_allcom_cont'] = before_cont[0]
            project_dict['before_mrg_cont'] = before_cont[1]
            project_dict['before_auth_new'] = before_cont[2]
            project_dict['before_commit_new'] = before_cont[3]
        else:
            return {}
        after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window, hours=0), commit_list, before_cont[4], before_cont[5])
        if after_cont != None:
            project_dict['after_allcom_cont'] = after_cont[0]
            project_dict['after_mrg_cont'] = after_cont[1]
            project_dict['after_auth_new'] = after_cont[2]
            project_dict['after_commit_new'] = after_cont[3]
        else:
            return {}
    print(project_dict)
    return project_dict
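
# Minimal usage sketch (hypothetical link): compute the before/after activity measures around a
# project's first README commit; the upstream link normally comes from the roster csv read below.
#   stats = file_get_pr("https://github.com/tqdm/tqdm", me_read=True)
#   stats['before_allcom_read']  # 27-entry list of weekly commit counts before the README appeared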

#TODO: pr_count should return an array of values for weekly/6mo
def pr_count(start, end, commits, author_roster, commit_roster):
    count = 0
    merge_count = 0
    by_week = [0] * 27
    by_week_merge = [0] * 27
    current_week = 0
    new_authors = 0
    new_committers = 0
    for commit in tqdm(commits):
        if dateutil.parser.parse(commit[0]) <= start:
            if commit[2] not in author_roster:
                author_roster.append(commit[2])
            if commit[1] and commit[3] not in commit_roster:
                commit_roster.append(commit[3])
        if dateutil.parser.parse(commit[0]) > start:
            if math.floor((dateutil.parser.parse(commit[0]) - start).days / 7) <= 26:
                by_week[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
                if commit[1]:
                    by_week_merge[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
                    if commit[3] not in commit_roster:
                        new_committers += 1
                        #remaining question of whether to make this the author of the merge commit[2] or the committer of the merge commit[3]
                        commit_roster.append(commit[3])
                if commit[2] not in author_roster:
                    new_authors += 1
                    author_roster.append(commit[2])
        if dateutil.parser.parse(commit[0]) > end:
            print(len(by_week))
            return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
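
# Return layout, for the callers above:
#   [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
# by_week and by_week_merge are 27-entry weekly counts starting at `start`; the function
# returns None implicitly if no commit falls after `end`, which file_get_pr guards against.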

def for_files():
    #csv_path = "final_data/deb_contribfile_roster.csv"
    csv_path = "final_data/deb_readme_roster.csv"
    count = 0
    with open(csv_path, 'r') as file:
        csv_reader = csv.DictReader(file)
        with open('kk_031624_pr_did.csv', "w") as writing_file:
            # this would also have to get switched for the cont dataset
            keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
            dict_writer = csv.DictWriter(writing_file, keys)
            dict_writer.writeheader()
            for row in csv_reader:
                count += 1
                print(row['upstream_vcs_link'])
                # this would have to get switched to False for the cont dataset
                try:
                    # file_get_pr can return None on a perceval failure, so fall back to an empty row
                    dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) or {}
                except:
                    dict_row = {}
                dict_writer.writerow(dict_row)

if __name__ == "__main__":
    for_files()
#file_get_pr("https://github.com/tqdm/tqdm", True)
#file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
#file_get_pr("https://github.com/walling/unorm/issues/new/", True)
#file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)