import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse

'''
RUNNING INSTRUCTIONS:
[1] set up a tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install the required pip packages (GitPython, tqdm, pandas)
[4] in your tmux environment, run the following three commands
    - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no'
    - export GIT_ASKPASS=false
    - export GIT_TERMINAL_PROMPT=0
[5] in tmux, run the script as follows with your START and STOP values
    - python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs

THANK YOU VERY MUCH - matt
'''

# FIX BELOW: temp_dir is where the repositories will be temporarily cloned to;
# if you are worried about space, specify a different location here
temp_dir = "/data/users/mgaughan/tmp3/"

cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)

# FIX BELOW: this is where the commit data will be stored; the parent directory below
# needs to contain the subdirectories contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"


def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : url link to upstream repo vcs
        temp_location : filepath to where the repo should be cloned to

    RETURNS
        repo : the GitRepository object of the cloned repo
        repo_path : the filepath to the cloned repository
    """
    #print(temp_location)
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path


def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository

    RETURNS
        0 if the deletion succeeded, 1 if there was no clone at the location
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1


# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # skip commits newer than the start date; stop once older than the cutoff
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info


def diff_analysis(diffs):
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects


def for_all_files(start_index, stop_index):
    cwd = os.getcwd()
    csv_path = "../final_data/deb_full_data.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    try:
        with open(csv_path, 'r') as file:
            lines = [line for line in file]
            for row in tqdm(csv.reader(lines), total=len(lines)):
                index += 1
                if index < start_index:
                    continue
                time.sleep(4)
                if row[0] == "":
                    empty_row += 1
                    continue
                # row[5] = upstream vcs
                temp_repo_path = ""
                und_repo_id = ""
                try:
                    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                    os.environ['GIT_ASKPASS'] = 'false'
                    os.environ['GIT_TERMINAL_PROMPT'] = '0'
                    ssh_url = ""
                    try:
                        if "github" in row[5]:
                            repo_id = row[5][len('https://github.com/'):]
                            ssh_url = f'git@github.com:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                        else:
                            parts = row[5].split('/')
                            domain = parts[2]
                            repo_id = '/'.join(parts[3:])
                            try:
                                temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                            except Exception as e:
                                print(f'non-GitHub cloning error, assuming HTTPS issue: {e}')
                                delete_clone(temp_dir)
                                ssh_url = f'git@{domain}:{repo_id}.git'
                                if ssh_url.endswith('.git.git'):
                                    ssh_url = ssh_url[:-4]
                                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    except Exception as e:
                        print(f'cloning error at {row[5]}')
                        print(f'inside cloning error: {e}')
                        raise ValueError(e)
                    # check out the last commit before the cutoff date
                    os.chdir(temp_repo_path)
                    os.system("git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00'`")
                    os.chdir(cwd)
                    has_readme_bool, has_contributing_bool = False, False
                    for filename in os.listdir(temp_repo_path):
                        if filename.startswith("README"):
                            has_readme_bool = True
                        if filename.startswith("CONTRIBUTING"):
                            has_contributing_bool = True
                    if has_readme_bool or has_contributing_bool:
                        commits_array = commit_analysis(temp_repo, from_date, to_date)
                        commits_df = pd.DataFrame.from_records(commits_array)
                        und_repo_id = '_'.join(repo_id.split("/"))
                        if has_readme_bool:
                            has_readme += 1
                            commits_df.to_csv(
                                f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                                index=False,
                            )
                        if has_contributing_bool:
                            has_contributing += 1
                            commits_df.to_csv(
                                f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                                index=False,
                            )
                except Exception as e:
                    clone_error.append([row[5], e])
                    print(f"outside cloning error: {e}")
                finally:
                    und_repo_id = ""
                    delete_clone(temp_dir)
                    os.chdir(cwd)
                if index == stop_index:
                    break
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        print(clone_error)
        with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
            for error in clone_error:
                # each entry is [vcs_link, exception]
                txt_file.write(f"{error}\n")
        with open(f"{stop_index}-success-output.txt", "w") as txt_file:
            txt_file.write(f"Number of Empty Rows: {empty_row} \n")
            txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
            txt_file.write(f"Number that has README: {has_readme} \n")
            txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
        print(f"Number of Empty Rows: {empty_row}")
        print(f"Number of Cloning Errors: {len(clone_error)}")
        print(f"Number that has README: {has_readme}")
        print(f"Number that has CONTRIBUTING: {has_contributing}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)

#temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
#delete_clone(temp_dir)
#python3 intermediary_script.py --start_index START --stop_index STOP
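# Minimal standalone sketch (not part of the batch run above): assuming a repository is
# already cloned locally, commit_analysis can be called directly over the same date window
# and the result saved with pandas. The local path and output filename are hypothetical
# placeholders, not paths used by this script.
#
#   repo = git.Repo("/path/to/some/local/clone")
#   commits = commit_analysis(repo, from_date, to_date)
#   pd.DataFrame.from_records(commits).to_csv("example_commits.csv", index=False)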