24_deb_pkg_gov/12825_revision/intermediary_script.py

247 lines
10 KiB
Python

import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
'''
RUNNING INSTRUCTIONS:
[1] set up tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install pip packages
[4] in your tmux environment, export the following three environment variables (the script also sets them itself before cloning)
- export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no'
- export GIT_ASKPASS='false'
- export GIT_TERMINAL_PROMPT='0'
[5] in tmux, run the script as follows with your START and STOP values
- python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
#FIX BELOW: temp_dir is the scratch directory each repository is cloned into (and removed from
# again once it has been processed); point it somewhere with enough free space
temp_dir = "/data/users/mgaughan/tmp3/"

# fixed UTC-6 offset used for both ends of the commit window
cst = datetime.timezone(-datetime.timedelta(hours=6))
# only commits whose commit date lies between from_date and to_date are recorded
from_date = datetime.datetime(1970, 1, 1, hour=12, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, hour=12, tzinfo=cst)

#FIX BELOW: parent directory for the commit CSVs; it must already contain the two
# subdirectories contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"
def temp_clone(vcs_link, temp_location):
    """
    Clone an upstream repository into a scratch directory.

    ARGS
    vcs_link : url link to upstream repo vcs (surrounding whitespace is stripped)
    temp_location : filepath to where the repo should be cloned to
    RETURNS
    repo : the git.Repo object of the cloned repo
    repo_path : the filepath to the cloned repository (identical to temp_location)
    RAISES
    whatever git.Repo.clone_from raises when the clone itself fails, e.g. because
    the link is unreachable or temp_location already holds other files
    """
    vcs_link = vcs_link.strip()
    # exist_ok=True: a scratch directory that already exists (pre-created by the
    # user, or left empty by an earlier run) must not abort the clone with
    # FileExistsError; git can clone into an existing empty directory.
    os.makedirs(temp_location, exist_ok=True)
    repo = git.Repo.clone_from(vcs_link, temp_location)
    print(f"Successfully Cloned {vcs_link}")
    return repo, temp_location
def delete_clone(temp_location):
    """
    Remove a cloned repository from disk.

    ARGS
    temp_location : filepath to the cloned repository
    RETURNS
    0 if the path existed and was removed, 1 if there was nothing at that path
    """
    # guard clause: nothing to clean up
    if not os.path.exists(temp_location):
        print("No clone at location")
        return 1

    shutil.rmtree(temp_location)
    print(f"{temp_location} has been deleted.")
    return 0
# The history is walked from HEAD backwards (newest commit first), which is why the
# upper bound (start_date) is tested before the lower bound (cutoff_date).
def commit_analysis(repo, cutoff_date, start_date):
    """
    Collect metadata and per-file diff statistics for the commits in a date window.

    ARGS
    repo : git.Repo whose history, as yielded by repo.iter_commits(), is walked
    cutoff_date : oldest commit date to keep; the walk stops at the first older commit
    start_date : newest commit date to keep; newer commits are skipped
    RETURNS
    commits_info : list with one dict per kept commit, in the order they were walked
    """
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # too recent: keep walking back until we enter the window
        if commit.committed_datetime > start_date:
            continue
        # too far back: everything after this point is assumed to be older still
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information: first label of the email domain.
        # `or ""` keeps a commit with a missing (None) email from aborting the
        # whole repository with an AttributeError; its org becomes "".
        commit_info['author_org'] = (commit_info["author_email"] or "").split("@")[-1].split(".")[0]
        commit_info['committer_org'] = (commit_info["committer_email"] or "").split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        if commit.parents:
            # x.diff(y) describes the change from x to y, so the diff has to be
            # taken from the first parent towards the commit; the other way round
            # yields the reverse patch (additions reported as deletions, and
            # a_path/b_path swapped relative to "parent"/"child").
            diffs = commit.parents[0].diff(commit, create_patch=True)
        else:
            # root commit: compare against the empty tree
            diffs = commit.diff(git.NULL_TREE, create_patch=True)
        commit_info["diff_info"] = diff_analysis(diffs)
        commits_info.append(commit_info)
    return commits_info
def diff_analysis(diffs):
    """
    Summarise every file-level diff of one commit.

    ARGS
    diffs : iterable of diff objects exposing a bytes patch in .diff plus
            a_path, b_path, change_type, new_file, deleted_file and renamed
    RETURNS
    list with one dict per diff: counts of added/deleted patch lines, both
    file paths, the change type and the new/deleted/renamed flags
    """

    def tally(patch_lines, marker):
        # lines opening with the marker, except those opening with it tripled
        # ("+++" / "---")
        tripled = marker * 3
        return sum(
            1
            for text in patch_lines
            if text.startswith(marker) and not text.startswith(tripled)
        )

    summaries = []
    for entry in diffs:
        # undecodable bytes are dropped rather than raising
        patch_lines = entry.diff.decode("utf-8", errors="ignore").split("\n")
        summaries.append(
            {
                "lines_added": tally(patch_lines, "+"),
                "lines_deleted": tally(patch_lines, "-"),
                "parent_filepath": entry.a_path,
                "child_filepath": entry.b_path,
                "change_type": entry.change_type,
                "new_file": entry.new_file,
                "deleted_file": entry.deleted_file,
                "renamed_file": entry.renamed,
            }
        )
    return summaries
def _ssh_url(domain, repo_id):
    """Build the git@DOMAIN:REPO.git form of a repository address."""
    ssh_url = f'git@{domain}:{repo_id}.git'
    # repo_id may already end in ".git"; do not double the suffix
    if ssh_url.endswith('.git.git'):
        ssh_url = ssh_url[:-4]
    return ssh_url


def _clone_upstream(vcs_link):
    """
    Clone one upstream link into temp_dir.

    GitHub links are cloned over SSH straight away; any other host is first
    tried with the link as given and, if that fails, retried over SSH.

    RETURNS
    (repo, repo_path, repo_id) where repo_id is the path part of the link
    RAISES
    ValueError wrapping whatever went wrong while parsing or cloning
    """
    try:
        # scheme://host/owner/name... -> parts[2] is the host, parts[3:] the repo path
        parts = vcs_link.split('/')
        repo_id = '/'.join(parts[3:])
        if "github" in vcs_link:
            repo, repo_path = temp_clone(_ssh_url('github.com', repo_id), temp_dir)
        else:
            domain = parts[2]
            try:
                repo, repo_path = temp_clone(vcs_link, temp_dir)
            except Exception as e:
                print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                # clear whatever the failed attempt left behind before retrying
                delete_clone(temp_dir)
                repo, repo_path = temp_clone(_ssh_url(domain, repo_id), temp_dir)
    except Exception as e:
        print(f'cloning error at {vcs_link}')
        print(f'inside cloning error: {e}')
        raise ValueError(e) from e
    return repo, repo_path, repo_id


def _checkout_snapshot(repo, before):
    """Best-effort checkout of the last commit on HEAD made before `before`."""
    try:
        # rev-list needs an explicit starting revision; without one it only
        # prints its usage text and the checkout silently becomes a no-op
        sha = repo.git.rev_list("-n", "1", f"--before={before}", "HEAD")
        if sha:
            repo.git.checkout(sha)
    except git.exc.GitCommandError as e:
        # keep going on the cloned HEAD rather than dropping the repository
        print(f"could not check out snapshot before {before}: {e}")


def for_all_files(start_index, stop_index):
    """
    Clone every repository listed in a slice of the batching CSV and save its commits.

    For each processed row the upstream link (column 5) is cloned into temp_dir,
    checked out as of the snapshot date, and scanned for top-level files whose
    names start with README or CONTRIBUTING. If either exists, the commit data
    from commit_analysis is written as a CSV below COMMIT_SAVE_PREFIX. The clone
    is deleted again after every row. A summary and the list of failures are
    written to "{stop_index}-success-output.txt" and
    "{stop_index}-clone-error-output.txt" in the working directory, also when the
    run is interrupted with Ctrl-C.

    ARGS
    start_index : first row index to process (0 is the first line of the CSV file)
    stop_index : last row index to process (inclusive)
    RETURNS
    None
    """
    csv_path = "for_batching/deb_full_data.csv"
    snapshot_cutoff = '2024-03-16 12:00:00'
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    # make git fail instead of waiting for interactive input
    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    os.environ['GIT_ASKPASS'] = 'false'
    os.environ['GIT_TERMINAL_PROMPT'] = '0'
    try:
        with open(csv_path, 'r') as file:
            # read everything up front so tqdm knows the total
            lines = list(file)
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index < start_index:
                continue
            # checked before any `continue` below, so an empty row at
            # stop_index can no longer make the run overshoot its slice
            if index > stop_index:
                break
            if not row or row[0] == "":
                empty_row += 1
                continue
            if len(row) < 6:
                clone_error.append([row[0], "row has no upstream VCS column"])
                continue
            #row[5] = upstream vcs
            vcs_link = row[5].strip()
            # pause between clones
            time.sleep(4)
            try:
                temp_repo, temp_repo_path, repo_id = _clone_upstream(vcs_link)
                _checkout_snapshot(temp_repo, snapshot_cutoff)
                filenames = os.listdir(temp_repo_path)
                has_readme_bool = any(name.startswith("README") for name in filenames)
                has_contributing_bool = any(name.startswith("CONTRIBUTING") for name in filenames)
                if has_readme_bool or has_contributing_bool:
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = repo_id.replace("/", "_")
                    if has_readme_bool:
                        has_readme += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
                    if has_contributing_bool:
                        has_contributing += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
            except Exception as e:
                # one bad repository must not stop the batch; record it and move on
                clone_error.append([vcs_link, str(e)])
                print(f"outside cloning error: {e}")
            finally:
                delete_clone(temp_dir)
    except KeyboardInterrupt:
        print("KeyBoardInterrrupt")
    finally:
        print(clone_error)
        with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
            for error in clone_error:
                txt_file.write(', '.join(error) + "\n")
        with open(f"{stop_index}-success-output.txt", "w") as txt_file:
            txt_file.write(f"Number of Empty Rows: {empty_row} \n")
            txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
            txt_file.write(f"Number that has README: {has_readme} \n")
            txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
        print(f"Number of Empty Rows: {empty_row}")
        print(f"Number of Cloning Errors: {len(clone_error)}")
        print(f"Number that has README: {has_readme}")
        print(f"Number that has CONTRIBUTING: {has_contributing}")
if __name__ == "__main__":
    # usage: python3 intermediary_script.py --start_index START --stop_index STOP
    cli = argparse.ArgumentParser(description="chase validation")
    # both bounds are mandatory integers
    for flag, help_text in (
        ("--start_index", "The starting index for the search"),
        ("--stop_index", "The stopping index for the search"),
    ):
        cli.add_argument(flag, type=int, required=True, help=help_text)
    bounds = cli.parse_args()
    for_all_files(bounds.start_index, bounds.stop_index)
    # manual smoke test of the clone helpers, kept for reference:
    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)