224 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			224 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import git 
 | 
						|
from tqdm import tqdm
 | 
						|
import csv
 | 
						|
import os
 | 
						|
import shutil
 | 
						|
import time
 | 
						|
import pandas as pd
 | 
						|
import datetime
 | 
						|
import argparse
 | 
						|
 | 
						|
# Scratch directory where each upstream repository is cloned to (and deleted from).
temp_dir = "/data/users/mgaughan/tmp3/"
# US Central Standard Time (UTC-6); makes the window datetimes timezone-aware.
cst = datetime.timezone(datetime.timedelta(hours=-6))
# Analysis window passed to commit_analysis: commits between the Unix epoch
# and 2024-03-16 (noon, CST) are considered.
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
# Root directory under which per-repo commit CSVs are written
# (readme_commit_data/ and contributing_commit_data/ subdirectories).
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"
 | 
						|
 | 
						|
def temp_clone(vcs_link, temp_location):
    """
    Clone an upstream repository into a scratch directory.

    ARGS
    vcs_link : url link to upstream repo vcs
    temp_location : filepath to where the repo should be cloned to

    RETURNS
    repo : the GitPython Repo object of the cloned repo
    repo_path : the filepath to the cloned repository
    """
    # trailing/leading whitespace in the CSV urls breaks git, so strip first
    cleaned_link = vcs_link.strip()
    # fails loudly if a stale clone is still present at temp_location
    os.makedirs(temp_location)
    cloned = git.Repo.clone_from(cleaned_link, temp_location)
    print(f"Successfully Cloned {cleaned_link}")
    return cloned, temp_location
 | 
						|
 | 
						|
 | 
						|
def delete_clone(temp_location):
    """
    Remove a previously cloned repository from disk.

    ARGS
    temp_location : filepath to the cloned repository

    RETURNS
    0 if the directory existed and was deleted, 1 if nothing was there
    """
    # guard clause: nothing to do when no clone exists at the location
    if not os.path.exists(temp_location):
        print("No clone at location")
        return 1
    shutil.rmtree(temp_location)
    print(f"{temp_location} has been deleted.")
    return 0
 | 
						|
    
 | 
						|
# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
    """
    Collect metadata for every commit whose commit date falls inside
    [cutoff_date, start_date].

    ARGS
    repo : GitPython Repo object to walk
    cutoff_date : oldest commit datetime to include (iteration stops here)
    start_date : newest commit datetime to include (newer commits are skipped)

    RETURNS
    commits_info : list of dicts, one per commit, holding author/committer
    identity, message, merge flag, containing branches, and per-file diff
    stats from diff_analysis
    """
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # iter_commits yields newest-first: skip commits newer than the
        # window, and stop the walk entirely once past the oldest bound
        # if too far back, break
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information
        # crude org guess: first label of the email domain
        # (e.g. "alice@lists.debian.org" -> "lists"), so this is approximate
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        # shells out to `git branch --contains <sha>` once per commit; this
        # is the slow part of the walk
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        # diff against the first parent, or the empty tree for root commits
        # NOTE(review): commit.diff(parent) diffs commit -> parent, so the
        # "+"/"-" counts downstream may be inverted relative to the usual
        # parent -> commit direction — confirm the intended orientation
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info
 | 
						|
 | 
						|
 | 
						|
def diff_analysis(diffs):
    """
    Summarize a sequence of GitPython Diff objects.

    ARGS
    diffs : iterable of Diff objects produced with create_patch=True, so that
    each element carries a bytes patch in its .diff attribute

    RETURNS
    diff_objects : list of dicts, one per diff, with added/deleted line
    counts, parent/child file paths, and change-type flags
    """
    diff_objects = []
    for diff in diffs:
        # decode the patch once and count both directions in a single pass
        # (the original decoded and split the same bytes twice)
        patch_text = diff.diff.decode("utf-8")
        lines_added = 0
        lines_deleted = 0
        for line in patch_text.split("\n"):
            # skip the "+++"/"---" file-header lines of the unified diff
            if line.startswith("+") and not line.startswith("+++"):
                lines_added += 1
            elif line.startswith("-") and not line.startswith("---"):
                lines_deleted += 1
        diff_info = {
            "lines_added": lines_added,
            "lines_deleted": lines_deleted,
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects
 | 
						|
 | 
						|
def for_all_files(start_index, stop_index):
    """
    Walk rows [start_index, stop_index] of the Debian package CSV, clone each
    row's upstream repository, and save its commit history when the checked-out
    tree contains a README or CONTRIBUTING file.

    ARGS
    start_index : first row index to process (0-based; NOTE the header line is
    not skipped, so it occupies index 0)
    stop_index : row index at which to stop (inclusive); also used to name the
    error and summary output files

    SIDE EFFECTS
    writes per-repo commit CSVs under COMMIT_SAVE_PREFIX, plus
    "<stop_index>-clone-error-output.txt" and "<stop_index>-success-output.txt"
    in the current working directory
    """
    cwd = os.getcwd()
    csv_path = "../final_data/deb_full_data.csv"
    index = -1
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    with open(csv_path, 'r') as file:
        # the header row is included here, so indices/totals count it
        lines = [line for line in file]
        for row in tqdm(csv.reader(lines), total=len(lines)):
            index += 1
            if index < start_index:
                continue
            if row[0] == "":
                empty_row += 1
                continue
            # row[5] = upstream vcs url
            temp_repo_path = ""
            und_repo_id = ""
            try:
                # disable every interactive git prompt so a bad/private URL
                # fails immediately instead of hanging the batch run
                os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                os.environ['GIT_ASKPASS'] = 'false'
                os.environ['GIT_TERMINAL_PROMPT'] = '0'
                ssh_url = ""
                try:
                    if "github" in row[5]:
                        # GitHub: clone over SSH directly
                        repo_id = row[5][len('https://github.com/'):]
                        ssh_url = f'git@github.com:{repo_id}.git'
                        if ssh_url.endswith('.git.git'):
                            ssh_url = ssh_url[:-4]
                        temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    else:
                        # other forges: try the HTTPS url first, fall back to SSH
                        parts = row[5].split('/')
                        domain = parts[2]
                        repo_id = '/'.join(parts[3:])
                        try:
                            temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                        except Exception as e:
                            print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                            delete_clone(temp_dir)
                            ssh_url = f'git@{domain}:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                except Exception as e:
                    print(f'cloning error at {row[5]}')
                    print(f'inside cloning error: {e}')
                    raise ValueError(e)
                # pin the working tree to the last commit before the cutoff
                # date (assumes the default branch is named "master")
                os.chdir(temp_repo_path)
                os.system("git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
                os.chdir(cwd)
                has_readme_bool, has_contributing_bool = False, False
                for filename in os.listdir(temp_repo_path):
                    if filename.startswith("README"):
                        has_readme_bool = True
                    if filename.startswith("CONTRIBUTING"):
                        has_contributing_bool = True
                # only walk the history when there is something to save
                if has_readme_bool or has_contributing_bool:
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = '_'.join(repo_id.split("/"))
                if has_readme_bool:
                    has_readme += 1
                    commits_df.to_csv(
                        f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                        index=False,
                        )
                if has_contributing_bool:
                    has_contributing += 1
                    commits_df.to_csv(
                        f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                        index=False,
                        )
            except Exception as e:
                clone_error.append([row[5], e])
                print(f"outside cloning error: {e}")
            finally:
                # always remove the scratch clone and restore the cwd,
                # success or not
                und_repo_id = ""
                delete_clone(temp_dir)
                os.chdir(cwd)

            if index == stop_index:
                break

    print(clone_error)
    with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
        for error in clone_error:
            # BUG FIX: each entry is a [url, exception] list, so it must be
            # stringified — the original `error + "\n"` raised a TypeError
            txt_file.write(f"{error}\n")
    with open(f"{stop_index}-success-output.txt", "w") as txt_file:
        txt_file.write(f"Number of Empty Rows: {empty_row} \n")
        txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
        txt_file.write(f"Number that has README: {has_readme} \n")
        txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
    print(f"Number of Empty Rows: {empty_row}")
    print(f"Number of Cloning Errors: {len(clone_error)}")
    print(f"Number that has README: {has_readme}")
    print(f"Number that has CONTRIBUTING: {has_contributing}")
 | 
						|
 | 
						|
if __name__ == "__main__":
    # command-line entry point: process CSV rows between the two indices
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument(
        "--start_index",
        type=int,
        required=True,
        help="The starting index for the search",
    )
    parser.add_argument(
        "--stop_index",
        type=int,
        required=True,
        help="The stopping index for the search",
    )
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)