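# Clones the upstream repositories listed in ../final_data/deb_full_data.csv,
# checks each working tree for README / CONTRIBUTING files, and writes the
# repository's commit history (between from_date and to_date) to CSV files
# under COMMIT_SAVE_PREFIX.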
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse

temp_dir = "/data/users/mgaughan/tmp3/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 0, 0, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 0, 0, tzinfo=cst)
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"


def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : URL of the upstream repository's version control system
        temp_location : filepath that the repository should be cloned to

    RETURNS
        repo : the git.Repo object of the cloned repository
        repo_path : the filepath to the cloned repository
    """
    #print(temp_location)
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path


def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository

    RETURNS
        0 if the clone was deleted, 1 if nothing existed at that location
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1

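# A minimal usage sketch of the two helpers above (the URL is a placeholder,
# not one of the repositories actually analyzed):
#
#   repo, repo_path = temp_clone("https://github.com/example/project", temp_dir)
#   ...inspect repo.iter_commits(), the working tree, etc....
#   delete_clone(repo_path)

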
# Commits are iterated in reverse chronological order, hence the flipped
# argument names: cutoff_date is the older bound, start_date the newer one.
def commit_analysis(repo, cutoff_date, start_date):
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # skip commits newer than the start of the window
        if commit.committed_datetime > start_date:
            continue
        # once a commit is older than the cutoff, everything after it is too old
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information: the first label of the email domain
        # (e.g. a hypothetical "dev@lists.example.org" yields "lists")
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # branches containing this commit; this takes a separate `git branch --contains` call
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information against the first parent (or the null tree for root commits)
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info

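# Shape of one record returned by commit_analysis (the values here are invented
# for illustration; only the keys come from the code above):
#
#   {"commit_hash": "3f2b...", "author_name": "Jane Doe",
#    "author_email": "jane@example.org", "author_org": "example",
#    "is_merge": False, "branches": "* master", "diff_info": [...], ...}

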
def diff_analysis(diffs):
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects

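# How the counts above behave on a small made-up patch:
#
#   --- a/foo.py
#   +++ b/foo.py
#   @@ -1,2 +1,2 @@
#   -old line
#   +new line
#
# lines_added == 1 and lines_deleted == 1; the "+++"/"---" file-header lines
# are excluded by the startswith checks.

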
def for_all_files(start_index, stop_index):
    cwd = os.getcwd()
    csv_path = "../final_data/deb_full_data.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    with open(csv_path, 'r') as file:
        # note: the DictReader is unused; all rows (including the header row)
        # are read below with csv.reader
        csv_reader = csv.DictReader(file)
        lines = [line for line in file]
        for row in tqdm(csv.reader(lines), total=len(lines)):
            index += 1
            #time.sleep(5)
            if index < start_index:
                continue
            if row[0] == "":
                empty_row += 1
                continue
            # row[5] holds the upstream VCS URL
            temp_repo_path = ""
            und_repo_id = ""
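            # Clone strategy (see the try/except chain below): GitHub URLs are
            # rewritten to their SSH form up front; other forges are tried over
            # HTTPS first, then fall back to a guessed git@<domain>:<path>.git
            # SSH URL if that clone fails.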
            try:
                # fail fast instead of hanging on host-key or credential prompts
                os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                os.environ['GIT_ASKPASS'] = 'false'
                os.environ['GIT_TERMINAL_PROMPT'] = '0'
                ssh_url = ""
                try:
                    if "github" in row[5]:
                        # e.g. https://github.com/example/project -> git@github.com:example/project.git
                        repo_id = row[5][len('https://github.com/'):]
                        ssh_url = f'git@github.com:{repo_id}.git'
                        if ssh_url.endswith('.git.git'):
                            ssh_url = ssh_url[:-4]
                        temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    else:
                        parts = row[5].split('/')
                        domain = parts[2]
                        repo_id = '/'.join(parts[3:])
                        try:
                            temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                        except Exception as e:
                            print(f'non-GitHub cloning error, assuming HTTPS issue: {e}')
                            delete_clone(temp_dir)
                            ssh_url = f'git@{domain}:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                except Exception as e:
                    print(f'cloning error at {row[5]}')
                    print(f'inside cloning error: {e}')
                    raise ValueError(e)
                os.chdir(temp_repo_path)
                # pin the working tree to the last commit before the cutoff date;
                # note this assumes the default branch is named 'master', so repos
                # without one are likely left at the tip of whatever branch was cloned
                os.system("git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
                os.chdir(cwd)
                has_readme_bool, has_contributing_bool = False, False
                for filename in os.listdir(temp_repo_path):
                    if filename.startswith("README"):
                        has_readme_bool = True
                    if filename.startswith("CONTRIBUTING"):
                        has_contributing_bool = True
                if has_readme_bool or has_contributing_bool:
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = '_'.join(repo_id.split("/"))
                    if has_readme_bool:
                        has_readme += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
                    if has_contributing_bool:
                        has_contributing += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
            except Exception as e:
                clone_error.append([row[5], e])
                print(f"outside cloning error: {e}")
            finally:
                und_repo_id = ""
                delete_clone(temp_dir)
                os.chdir(cwd)

            if index == stop_index:
                break

    print(clone_error)
    with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
        for error in clone_error:
            # each entry is a [url, exception] pair, so format it rather than
            # concatenating (a list can't be added to a string)
            txt_file.write(f"{error}\n")
    with open(f"{stop_index}-success-output.txt", "w") as txt_file:
        txt_file.write(f"Number of Empty Rows: {empty_row} \n")
        txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
        txt_file.write(f"Number that has README: {has_readme} \n")
        txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
    print(f"Number of Empty Rows: {empty_row}")
    print(f"Number of Cloning Errors: {len(clone_error)}")
    print(f"Number that has README: {has_readme}")
    print(f"Number that has CONTRIBUTING: {has_contributing}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)
    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)
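
# Example invocation (the script filename and index range are placeholders):
#   python commit_scrape.py --start_index 0 --stop_index 100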