backing up with revision

Matthew Gaughan 2025-01-28 23:04:51 -06:00
parent d6e671c7a5
commit 2d9ce17e3a
3 changed files with 515 additions and 0 deletions


@@ -0,0 +1,116 @@
import csv
from git import Repo
from perceval.backends.core.git import Git
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
import io
import re
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2"
temp_dir = "/data/users/mgaughan/tmp3/"
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    if "github" in vcs_link or "gitlab" in vcs_link:
        #making an evaluation that sub branches aren't being used and that people would fork if needed
        #this only looks at main
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
        other_temp_path = temp_dir + vcs_link.split('/')[4] + ".git0"
    else:
        full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
        other_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git0"
    # two clones of the same repo: GitPython for tree/blob access, Perceval for commit iteration
    repo0 = Repo.clone_from(vcs_link, full_temp_path)
    repo = Git(uri=vcs_link, gitpath=other_temp_path)
    commit0 = repo0.commit(commit_hash)
    commits = repo.fetch()
    target_filename = ""
    for commit in commits:
        files = commit['data']['files']
        for file in files:
            if is_readme:
                if "README" in file['file']:
                    #print(file['file'])
                    if "/" in file['file']:
                        target_filename = file['file'].split("/")[-1]
                    else:
                        target_filename = file['file']
            else:
                if "CONTRIBUTING" in file['file']:
                    if "/" in file['file']:
                        target_filename = file['file'].split("/")[-1]
                    else:
                        target_filename = file['file']
    if target_filename == "":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)
        return "NoFile"
    targetfile = ""
    # look for the relevant file among the top-level blobs of the commit's tree
    target_pattern = "README" if is_readme else "CONTRIBUTING"
    for blob in commit0.tree.blobs:
        #print(type(blob.path))
        if target_pattern in blob.path:
            targetfile = blob
            #print(blob.path)
            # why would a file not be in the commit tree? but would be in the directory?
            #shutil.rmtree(full_temp_path, ignore_errors=True)
            # return "KeyError -- the file is not in the commit tree"
    if targetfile == "":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)
        return "KeyError -- the file is not in the commit tree"
    if is_readme:
        last_path = "readme2"
    else:
        last_path = "contributing2"
    with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path, "w") as file:
        with io.BytesIO(targetfile.data_stream.read()) as f:
            file.write(f.read().decode('utf-8', errors='ignore'))
            #file.write(f.read())
    shutil.rmtree(full_temp_path, ignore_errors=True)
    shutil.rmtree(other_temp_path, ignore_errors=True)
    return "NoError"

def for_all_files():
    #toggle this based on readme or contributing files
    readme_is = False
    csv_path = "final_data/deb_contrib_did_data.csv"
    index = -1
    saved = []
    with open(csv_path, 'r') as file:
        with open('e_031824_spec_errors.csv', "w") as writing_file:
            csv_writer = csv.writer(writing_file)
            with open("contributing_completed_downloads.csv", "w") as writing_file2:
                csv_writer2 = csv.writer(writing_file2)
                #csv_reader = csv.DictReader(file)
                lines = [line for line in file]
                for row in tqdm(csv.reader(lines), total=len(lines)):
                    index += 1
                    if index == 0:
                        continue
                    if row[0] == "":
                        continue
                    #print(row[0])
                    return_value = get_file(row[0], row[2], readme_is)
                    if return_value != "NoError":
                        csv_writer.writerow([row[0], row[2], readme_is, return_value])
                    else:
                        # if it is NoError, write the row to the csv of completed downloads;
                        # there's an issue of duplicates, so only record each vcs link once
                        if row[0] in saved:
                            continue
                        saved.append(row[0])
                        csv_writer2.writerow(row)

if __name__ == "__main__":
    for_all_files()


@@ -0,0 +1,224 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
temp_dir = "/data/users/mgaughan/tmp3/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"

def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : url link to upstream repo vcs
        temp_location : filepath to where the repo should be cloned to
    RETURNS
        repo : the GitRepository object of the cloned repo
        repo_path : the filepath to the cloned repository
    """
    #print(temp_location)
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path

def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository
    RETURNS
        whether or not the deletion was a success (0 on success, 1 if there was nothing to delete)
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1
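
# Illustrative pairing of the two helpers above (hypothetical URL): temp_clone() makes the
# working copy and delete_clone() is expected to clean it up afterwards, even on failure.
#   repo, repo_path = temp_clone("https://github.com/tqdm/tqdm", temp_dir)
#   ...run analysis on repo...
#   delete_clone(temp_dir)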

# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # skip commits newer than the analysis window; stop once we are past its lower bound
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information, e.g. "dev@lists.debian.org" -> "lists"
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info

def diff_analysis(diffs):
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects
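
# For reference, each element of the returned list is a dict shaped like the (made-up) example
# below; lines_added/lines_deleted are counted from the unified diff text rather than git stats.
#   {"lines_added": 3, "lines_deleted": 1, "parent_filepath": "README.md",
#    "child_filepath": "README.md", "change_type": "M", "new_file": False,
#    "deleted_file": False, "renamed_file": False}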

def for_all_files(start_index, stop_index):
    cwd = os.getcwd()
    csv_path = "../final_data/deb_full_data.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    with open(csv_path, 'r') as file:
        csv_reader = csv.DictReader(file)
        lines = [line for line in file]
        for row in tqdm(csv.reader(lines), total=len(lines)):
            index += 1
            #time.sleep(5)
            if index < start_index:
                continue
            if row[0] == "":
                empty_row += 1
                continue
            #row[5] = upstream vcs
            temp_repo_path = ""
            und_repo_id = ""
            try:
                os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                os.environ['GIT_ASKPASS'] = 'false'
                os.environ['GIT_TERMINAL_PROMPT'] = '0'
                ssh_url = ""
                try:
                    if "github" in row[5]:
                        repo_id = row[5][len('https://github.com/'):]
                        ssh_url = f'git@github.com:{repo_id}.git'
                        if ssh_url.endswith('.git.git'):
                            ssh_url = ssh_url[:-4]
                        temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    else:
                        parts = row[5].split('/')
                        domain = parts[2]
                        repo_id = '/'.join(parts[3:])
                        try:
                            temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                        except Exception as e:
                            print(f'non-GitHub cloning error, assuming HTTPS issue: {e}')
                            delete_clone(temp_dir)
                            ssh_url = f'git@{domain}:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                except Exception as e:
                    print(f'cloning error at {row[5]}')
                    print(f'inside cloning error: {e}')
                    raise ValueError(e)
                os.chdir(temp_repo_path)
                os.system("git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
                os.chdir(cwd)
                has_readme_bool, has_contributing_bool = False, False
                for filename in os.listdir(temp_repo_path):
                    if filename.startswith("README"):
                        has_readme_bool = True
                    if filename.startswith("CONTRIBUTING"):
                        has_contributing_bool = True
                if has_readme_bool or has_contributing_bool:
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = '_'.join(repo_id.split("/"))
                    if has_readme_bool:
                        has_readme += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
                    if has_contributing_bool:
                        has_contributing += 1
                        commits_df.to_csv(
                            f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                            index=False,
                        )
            except Exception as e:
                clone_error.append([row[5], e])
                print(f"outside cloning error: {e}")
            finally:
                und_repo_id = ""
                delete_clone(temp_dir)
                os.chdir(cwd)
            if index == stop_index:
                break
    print(clone_error)
    with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
        for error in clone_error:
            # each entry is [vcs_link, exception], so format it before writing
            txt_file.write(f"{error}\n")
    with open(f"{stop_index}-success-output.txt", "w") as txt_file:
        txt_file.write(f"Number of Empty Rows: {empty_row} \n")
        txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
        txt_file.write(f"Number that has README: {has_readme} \n")
        txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
    print(f"Number of Empty Rows: {empty_row}")
    print(f"Number of Cloning Errors: {len(clone_error)}")
    print(f"Number that has README: {has_readme}")
    print(f"Number that has CONTRIBUTING: {has_contributing}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)
    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)
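
# Example invocation (the script name is whatever this file is saved as), processing rows 0-499:
#   python3 <this_script>.py --start_index 0 --stop_index 500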


@@ -0,0 +1,175 @@
import csv
from perceval.backends.core.git import Git
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
key = os.environ.get('KKEXKEY')
early_cutoff = dt.datetime(2008,2, 8)
temp_dir = "/data/users/mgaughan/tmp/"
'''
- rate of change, rate of all/day
'''

def file_get_pr(upstream_vcs_link, me_read):
    # if we're looking at readmes me_read is true; if we're looking at contributing files, it's false
    # this is the window of days on either side of the event that we're looking at
    window = 182
    #print(upstream_vcs_link.split('/')[4])
    project_dict = {}
    project_dict['upstream_vcs_link'] = upstream_vcs_link
    upstream_vcs_link = upstream_vcs_link.strip()
    if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
        #making an evaluation that sub branches aren't being used and that people would fork if needed
        #this only looks at main
        upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5])
        print(upstream_vcs_link)
        full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git"
    else:
        full_temp_path = temp_dir + upstream_vcs_link.split('/')[-1] + ".git"
        print(upstream_vcs_link)
    if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib":
        shutil.rmtree(full_temp_path, ignore_errors=True)
        return {}
    repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
    try:
        commits = repo.fetch()
    except:
        print("perceval issue")
        return
    has_readme = False
    has_contributing = False
    merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0
    #list of tuples which has date and whether it was a merge
    commit_list = []
    first_date_readme = ""
    first_date_contributing = ""
    for commit in commits:
        #print(commit['data'])
        if "Merge" in commit['data'].keys():
            commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']])
            if has_contributing:
                merge_post_cont += 1
            else:
                merge_pre_cont += 1
        else:
            commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']])
        files = commit['data']['files']
        #print(commit['data']['CommitDate'])
        #print(type(dateutil.parser.parse(commit['data']['CommitDate'])))
        for file in files:
            if "CONTRIBUTING" in file['file'] and has_contributing == False:
                has_contributing = True
                first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate'])
            if "README" in file['file'] and has_readme == False:
                has_readme = True
                first_date_readme = dateutil.parser.parse(commit['data']['CommitDate'])
                project_dict['readme_commit_hash'] = commit['data']['commit']
    shutil.rmtree(full_temp_path, ignore_errors=True)
    # bail out if the file we care about never shows up in the history
    if me_read and first_date_readme == "":
        return {}
    if not me_read and first_date_contributing == "":
        return {}
    if me_read:
        project_dict['first_readme'] = first_date_readme
        before_read = pr_count(first_date_readme + dt.timedelta(days=-window, hours=0), first_date_readme, commit_list, [], [])
        if before_read != None:
            project_dict['before_allcom_read'] = before_read[0]
            project_dict['before_mrg_read'] = before_read[1]
            project_dict['before_auth_new'] = before_read[2]
            project_dict['before_commit_new'] = before_read[3]
        else:
            return {}
        after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window, hours=0), commit_list, before_read[4], before_read[5])
        if after_read != None:
            project_dict['after_allcom_read'] = after_read[0]
            project_dict['after_mrg_read'] = after_read[1]
            project_dict['after_auth_new'] = after_read[2]
            project_dict['after_commit_new'] = after_read[3]
        else:
            return {}
    else:
        project_dict['first_contributing'] = first_date_contributing
        before_cont = pr_count(first_date_contributing + dt.timedelta(days=-window, hours=0), first_date_contributing, commit_list, [], [])
        if before_cont != None:
            project_dict['before_allcom_cont'] = before_cont[0]
            project_dict['before_mrg_cont'] = before_cont[1]
            project_dict['before_auth_new'] = before_cont[2]
            project_dict['before_commit_new'] = before_cont[3]
        else:
            return {}
        after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window, hours=0), commit_list, before_cont[4], before_cont[5])
        if after_cont != None:
            project_dict['after_allcom_cont'] = after_cont[0]
            project_dict['after_mrg_cont'] = after_cont[1]
            project_dict['after_auth_new'] = after_cont[2]
            project_dict['after_commit_new'] = after_cont[3]
        else:
            return {}
    print(project_dict)
    return project_dict
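
# Minimal usage sketch (hypothetical link): compute the before/after activity measures around a
# project's first README commit; the upstream link normally comes from the roster csv read below.
#   stats = file_get_pr("https://github.com/tqdm/tqdm", me_read=True)
#   stats['before_allcom_read']  # 27-entry list of weekly commit counts before the README appeared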

#TODO: pr_count should return an array of values for weekly/6mo
def pr_count(start, end, commits, author_roster, commit_roster):
    count = 0
    merge_count = 0
    by_week = [0] * 27
    by_week_merge = [0] * 27
    current_week = 0
    new_authors = 0
    new_committers = 0
    for commit in tqdm(commits):
        if dateutil.parser.parse(commit[0]) <= start:
            if commit[2] not in author_roster:
                author_roster.append(commit[2])
            if commit[1] and commit[3] not in commit_roster:
                commit_roster.append(commit[3])
        if dateutil.parser.parse(commit[0]) > start:
            if math.floor((dateutil.parser.parse(commit[0]) - start).days / 7) <= 26:
                by_week[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
                if commit[1]:
                    by_week_merge[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
                    if commit[3] not in commit_roster:
                        new_committers += 1
                        #remaining question of whether to make this the author of the merge commit[2] or the committer of the merge commit[3]
                        commit_roster.append(commit[3])
                if commit[2] not in author_roster:
                    new_authors += 1
                    author_roster.append(commit[2])
        if dateutil.parser.parse(commit[0]) > end:
            print(len(by_week))
            return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
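
# Return layout, for the callers above:
#   [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
# by_week and by_week_merge are 27-entry weekly counts starting at `start`; the function
# returns None implicitly if no commit falls after `end`, which file_get_pr guards against.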

def for_files():
    #csv_path = "final_data/deb_contribfile_roster.csv"
    csv_path = "final_data/deb_readme_roster.csv"
    count = 0
    with open(csv_path, 'r') as file:
        csv_reader = csv.DictReader(file)
        with open('kk_031624_pr_did.csv', "w") as writing_file:
            # this would also have to get switched for the cont dataset
            keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
            dict_writer = csv.DictWriter(writing_file, keys)
            dict_writer.writeheader()
            for row in csv_reader:
                count += 1
                print(row['upstream_vcs_link'])
                # this would have to get switched to False for the cont dataset
                try:
                    # file_get_pr can return None on a perceval failure, so fall back to an empty row
                    dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) or {}
                except:
                    dict_row = {}
                dict_writer.writerow(dict_row)

if __name__ == "__main__":
    for_files()
#file_get_pr("https://github.com/tqdm/tqdm", True)
#file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
#file_get_pr("https://github.com/walling/unorm/issues/new/", True)
#file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)