From c0b4a905c61e7c05dc6f92a41529165af60872c3 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Sun, 18 May 2025 18:55:08 -0500 Subject: [PATCH] uncommitted changes --- 12825_revision/get_spec_file.py | 116 ------------- 12825_revision/intermediary_script.py | 224 -------------------------- 12825_revision/pr_data_get.py | 175 -------------------- 3 files changed, 515 deletions(-) delete mode 100644 12825_revision/get_spec_file.py delete mode 100644 12825_revision/intermediary_script.py delete mode 100644 12825_revision/pr_data_get.py diff --git a/12825_revision/get_spec_file.py b/12825_revision/get_spec_file.py deleted file mode 100644 index e0ee467..0000000 --- a/12825_revision/get_spec_file.py +++ /dev/null @@ -1,116 +0,0 @@ -import csv -from git import Repo -from perceval.backends.core.git import Git -import os -import datetime as dt -import time -import shutil -import pandas as pd -import dateutil -from tqdm import tqdm -import math -import io -import re - -working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2" -temp_dir = "/data/users/mgaughan/tmp3/" - -# getting the specific readme or contributing file from a given commit -# inputs: upstream vcs link, commit hash, yes/no is it a readme -def get_file(vcs_link, commit_hash, is_readme): - if "github" in vcs_link or "gitlab" in vcs_link: - #making an evaluation that sub branches aren't being used and that people would fork if needed - #this only looks at main - vcs_link = "/".join(vcs_link.split("/")[0:5]) - full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git" - other_temp_path = temp_dir + vcs_link.split('/')[4] + ".git0" - else: - full_temp_path = temp_dir + vcs_link.split('/')[- 1] + ".git" - other_temp_path = temp_dir + vcs_link.split('/')[- 1] + ".git0" - repo0 = Repo.clone_from(vcs_link, full_temp_path) - repo = Git(uri=vcs_link, gitpath=other_temp_path) - commit0 = repo0.commit(commit_hash) - commits = repo.fetch() - target_filename = "" - for commit in commits: - files = commit['data']['files'] - for file in files: - if is_readme: - if "README" in file['file']: - #print(file['file']) - if "/" in file['file']: - target_filename = file['file'].split("/")[-1] - else: - target_filename = file['file'] - else: - if "CONTRIBUTING" in file['file']: - if "/" in file['file']: - target_filename = file['file'].split("/")[-1] - else: - target_filename = file['file'] - - if target_filename == "": - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "NoFile" - - targetfile = "" - for blob in commit0.tree.blobs: - #print(type(blob.path)) - if "CONTRIBUTING" in blob.path: - targetfile = blob - #print(blob.path) - # why would a file not be in the commit tree? but would be in the directory? 
- #shutil.rmtree(full_temp_path, ignore_errors=True) - # return "KeyError -- the file is not in the commit tree" - if targetfile == "": - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "KeyError -- the file is not in the commit tree" - if is_readme: - last_path = "readme2" - else: - last_path = "contributing2" - with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file: - with io.BytesIO(targetfile.data_stream.read()) as f: - file.write(f.read().decode('utf-8', errors='ignore')) - #file.write(f.read()) - file.close() - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "NoError" - -def for_all_files(): - #toggle this based on readme or contributing files - readme_is = False - csv_path = "final_data/deb_contrib_did_data.csv" - index = -1 - saved = [] - with open(csv_path, 'r') as file: - with open('e_031824_spec_errors.csv', "w") as writing_file: - csv_writer = csv.writer(writing_file) - with open("contributing_completed_downloads.csv", "w") as writing_file2: - csv_writer2 = csv.writer(writing_file2) - #csv_reader = csv.DictReader(file) - lines = [line for line in file] - for row in tqdm(csv.reader(lines), total=len(lines)): - index += 1 - if index == 0: - continue - if row[0] == "": - continue - #print(row[0]) - return_value = get_file(row[0], row[2], readme_is) - if return_value != "NoError": - csv_writer.writerow([row[0], row[2], readme_is, return_value]) - else: - if row[0] in saved: - continue - saved.append(row[0]) - csv_writer2.writerow(row) - # if it is noError, just write the row down in a different csv - # there's an issue of duplicates, but just keep it moving - # if no duplicates -- just run it through - -if __name__ == "__main__": - for_all_files() diff --git a/12825_revision/intermediary_script.py b/12825_revision/intermediary_script.py deleted file mode 100644 index b41b324..0000000 --- a/12825_revision/intermediary_script.py +++ /dev/null @@ -1,224 +0,0 @@ -import git -from tqdm import tqdm -import csv -import os -import shutil -import time -import pandas as pd -import datetime -import argparse - -temp_dir = "/data/users/mgaughan/tmp3/" -cst = datetime.timezone(datetime.timedelta(hours=-6)) -from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst) -to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst) -COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/" - -def temp_clone(vcs_link, temp_location): - """ - ARGS - vcs_link : url link to upstream repo vcs - temp_location : filepath to where the repo should be cloned to - - RETURNS - repo : the GitRepository object of the cloned repo - repo_path : the filepath to the cloned repository - """ - #print(temp_location) - vcs_link = vcs_link.strip() - os.makedirs(temp_location) - repo_path = temp_location - repo = git.Repo.clone_from(vcs_link, repo_path) - print(f"Successfully Cloned {vcs_link}") - return repo, repo_path - - -def delete_clone(temp_location): - """ - ARGS - temp_location : filepath to the cloned repository - - RETURNS - whether or not the deletion was a success - """ - if os.path.exists(temp_location): - shutil.rmtree(temp_location) - print(f"{temp_location} has been deleted.") - return 0 - else: - print("No clone at location") - return 1 - -# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments -def 
commit_analysis(repo, cutoff_date, start_date): - print("Analyzing Commits...") - commits_info = [] - for commit in repo.iter_commits(): - # if too far back, break - if commit.committed_datetime > start_date: - continue - if commit.committed_datetime < cutoff_date: - break - commit_info = { - "commit_hash": commit.hexsha, - "author_name": commit.author.name, - "author_email": commit.author.email, - "authored_date": commit.authored_datetime, - "committer_name": commit.committer.name, - "committer_email": commit.committer.email, - "commit_date": commit.committed_datetime, - "message": commit.message, - "is_merge": len(commit.parents) > 1, - } - # author/committer org information - commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0] - commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0] - # some more effort to get this information - commit_info["branches"] = repo.git.branch( - "--contains", commit_info["commit_hash"] - ) - # diff information - diffs = commit.diff( - commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True - ) - commit_info["diff_info"] = diff_analysis(diffs) - # print(commit_info) - commits_info.append(commit_info) - return commits_info - - -def diff_analysis(diffs): - diff_objects = [] - for diff in diffs: - diff_info = { - "lines_added": sum( - 1 - for line in diff.diff.decode("utf-8").split("\n") - if line.startswith("+") and not line.startswith("+++") - ), - "lines_deleted": sum( - 1 - for line in diff.diff.decode("utf-8").split("\n") - if line.startswith("-") and not line.startswith("---") - ), - "parent_filepath": diff.a_path, - "child_filepath": diff.b_path, - "change_type": diff.change_type, - "new_file": diff.new_file, - "deleted_file": diff.deleted_file, - "renamed_file": diff.renamed, - #'diff': diff.diff.decode('utf-8') - } - diff_objects.append(diff_info) - return diff_objects - -def for_all_files(start_index, stop_index): - cwd = os.getcwd() - csv_path = "../final_data/deb_full_data.csv" - index = -1 - saved = [] - empty_row = 0 - clone_error =[] - has_readme = 0 - has_contributing = 0 - with open(csv_path, 'r') as file: - csv_reader = csv.DictReader(file) - lines = [line for line in file] - for row in tqdm(csv.reader(lines), total=len(lines)): - index += 1 - #time.sleep(5) - if index < start_index: - continue - if row[0] == "": - empty_row += 1 - continue - #row[5] = upstream vcs - temp_repo_path = "" - und_repo_id = "" - try: - os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no' - os.environ['GIT_ASKPASS'] = 'false' - os.environ['GIT_TERMINAL_PROMPT'] = '0' - ssh_url = "" - try: - if "github" in row[5]: - repo_id = row[5][len('https://github.com/'):] - ssh_url = f'git@github.com:{repo_id}.git' - if ssh_url.endswith('.git.git'): - ssh_url = ssh_url[:-4] - temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir) - else: - parts = row[5].split('/') - domain = parts[2] - repo_id = '/'.join(parts[3:]) - try: - temp_repo, temp_repo_path = temp_clone(row[5], temp_dir) - except Exception as e: - print(f'non-Github cloning error, assuming HTTPS issue: {e}') - delete_clone(temp_dir) - ssh_url = f'git@{domain}:{repo_id}.git' - if ssh_url.endswith('.git.git'): - ssh_url = ssh_url[:-4] - temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir) - except Exception as e: - print(f'cloning error at {row[5]}') - print(f'inside cloning error: {e}') - raise ValueError(e) - os.chdir(temp_repo_path) - os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' 
master`") - os.chdir(cwd) - has_readme_bool, has_contributing_bool = False, False - for filename in os.listdir(temp_repo_path): - if filename.startswith("README"): - has_readme_bool = True - if filename.startswith("CONTRIBUTING"): - has_contributing_bool = True - if has_readme_bool or has_contributing_bool: - commits_array = commit_analysis(temp_repo, from_date, to_date) - commits_df = pd.DataFrame.from_records(commits_array) - und_repo_id = '_'.join(repo_id.split("/")) - if has_readme_bool: - has_readme += 1 - commits_df.to_csv( - f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv", - index=False, - ) - if has_contributing_bool: - has_contributing += 1 - commits_df.to_csv( - f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv", - index=False, - ) - except Exception as e: - clone_error.append([row[5], e]) - print(f"outside cloning error: {e}") - finally: - und_repo_id = "" - delete_clone(temp_dir) - os.chdir(cwd) - - if index == stop_index: - break - - print(clone_error) - with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file: - for error in clone_error: - txt_file.write(error + "\n") - with open(f"{stop_index}-success-output.txt", "w") as txt_file: - txt_file.write(f"Number of Empty Rows: {empty_row} \n") - txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n") - txt_file.write(f"Number that has README: {has_readme} \n") - txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}") - print(f"Number of Empty Rows: {empty_row}") - print(f"Number of Cloning Errors: {len(clone_error)}") - print(f"Number that has README: {has_readme}") - print(f"Number that has CONTRIBUTING: {has_contributing}") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="chase validation") - parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search") - parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search") - args = parser.parse_args() - for_all_files(args.start_index, args.stop_index) - #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir) - #delete_clone(temp_dir) \ No newline at end of file diff --git a/12825_revision/pr_data_get.py b/12825_revision/pr_data_get.py deleted file mode 100644 index 0969355..0000000 --- a/12825_revision/pr_data_get.py +++ /dev/null @@ -1,175 +0,0 @@ -import csv -from perceval.backends.core.git import Git -import os -import datetime as dt -import time -import shutil -import pandas as pd -import dateutil -from tqdm import tqdm -import math - -key = os.environ.get('KKEXKEY') - -early_cutoff = dt.datetime(2008,2, 8) -temp_dir = "/data/users/mgaughan/tmp/" - -''' -- rate of change, rate of all/day -''' -def file_get_pr(upstream_vcs_link, me_read): - # if we're looking at readmes me_read is true and if not, if we're looking at contributing files, it's false - #this is the window of days on either side of the event that we're looking at - window = 182 - #print(upstream_vcs_link.split('/')[4]) - project_dict = {} - project_dict['upstream_vcs_link'] = upstream_vcs_link - upstream_vcs_link = upstream_vcs_link.strip() - if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link: - #making an evaluation that sub branches aren't being used and that people would fork if needed - #this only looks at main - upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5]) - print(upstream_vcs_link) - full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git" - else: 
- full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git" - print(upstream_vcs_link) - if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib": - shutil.rmtree(full_temp_path, ignore_errors=True) - return {} - repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path) - try: - commits = repo.fetch() - except: - print("perceval issue") - return - has_readme = False - has_contributing = False - merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0 - #list of tuples which has date and whether it was a merge - commit_list = [] - first_date_readme = "" - for commit in commits: - #print(commit['data']) - if "Merge" in commit['data'].keys(): - commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']]) - if has_contributing: - merge_post_cont += 1 - else: - merge_pre_cont += 1 - else: - commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']]) - files = commit['data']['files'] - #print(commit['data']['CommitDate']) - #print(type(dateutil.parser.parse(commit['data']['CommitDate']))) - for file in files: - if "CONTRIBUTING" in file['file'] and has_contributing == False: - has_contributing = True - first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate']) - if "README" in file['file'] and has_readme == False: - has_readme = True - first_date_readme = dateutil.parser.parse(commit['data']['CommitDate']) - project_dict['readme_commit_hash'] = commit['data']['commit'] - shutil.rmtree(full_temp_path, ignore_errors=True) - if first_date_readme == "": - return {} - if me_read: - project_dict['first_readme'] = first_date_readme - before_read = pr_count(first_date_readme+ dt.timedelta(days=-window, hours=0), first_date_readme, commit_list, [], []) - if before_read != None: - project_dict['before_allcom_read'] = before_read[0] - project_dict['before_mrg_read'] = before_read[1] - project_dict['before_auth_new'] = before_read[2] - project_dict['before_commit_new'] = before_read[3] - else: - return {} - after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window, hours=0), commit_list, before_read[4], before_read[5]) - if after_read != None: - project_dict['after_allcom_read'] = after_read[0] - project_dict['after_mrg_read'] = after_read[1] - project_dict['after_auth_new'] = after_read[2] - project_dict['after_commit_new'] = after_read[3] - else: - return {} - else: - project_dict['first_contributing'] = first_date_contributing - before_cont = pr_count(first_date_contributing + dt.timedelta(days=-window, hours=0), first_date_contributing, commit_list, [], []) - if before_cont != None: - project_dict['before_allcom_cont'] = before_cont[0] - project_dict['before_mrg_cont'] = before_cont[1] - project_dict['before_auth_new'] = before_cont[2] - project_dict['before_commit_new'] = before_cont[3] - else: - return {} - after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window, hours=0), commit_list, before_cont[4], before_cont[5]) - if after_cont != None: - project_dict['after_allcom_cont'] = after_cont[0] - project_dict['after_mrg_cont'] = after_cont[1] - project_dict['after_auth_new'] = after_cont[2] - project_dict['after_commit_new'] = after_cont[3] - else: - return {} - print(project_dict) - return project_dict - - -#TODO: pr_count should return an array of values for weekly/6mo -def pr_count(start, end, commits, author_roster, 
commit_roster): - count = 0 - merge_count = 0 - by_week = [0] * 27 - by_week_merge =[0] * 27 - current_week = 0 - new_authors = 0 - new_committers = 0 - for commit in tqdm(commits): - if dateutil.parser.parse(commit[0]) <= start: - if commit[2] not in author_roster: - author_roster.append(commit[2]) - if commit[1] and commit[3] not in commit_roster: - commit_roster.append(commit[3]) - if dateutil.parser.parse(commit[0]) > start: - if math.floor((dateutil.parser.parse(commit[0]) - start).days / 7) <= 26: - by_week[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1 - if commit[1]: - by_week_merge[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1 - if commit[3] not in commit_roster: - new_committers += 1 - #remaining question of whether to make this the author of the merge commit[2] or the committer of the merge commit[3] - commit_roster.append(commit[3]) - if commit[2] not in author_roster: - new_authors += 1 - author_roster.append(commit[2]) - if dateutil.parser.parse(commit[0]) > end: - print(len(by_week)) - return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster] - -def for_files(): - #csv_path = "final_data/deb_contribfile_roster.csv" - csv_path = "final_data/deb_readme_roster.csv" - count = 0 - with open(csv_path, 'r') as file: - csv_reader = csv.DictReader(file) - with open('kk_031624_pr_did.csv', "w") as writing_file: - # this would also have to get switched fro the cont dataset - keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new'] - dict_writer = csv.DictWriter(writing_file, keys) - dict_writer.writeheader() - for row in csv_reader: - count += 1 - print(row['upstream_vcs_link']) - # this would have to get switched to false for the cont dataset - try: - dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) - except: - dict_row = {} - dict_writer.writerow(dict_row) - - -if __name__ == "__main__": - for_files() - #file_get_pr("https://github.com/tqdm/tqdm", True) - #file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True) - #file_get_pr("https://github.com/walling/unorm/issues/new/", True) - #file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True) -
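
For reference, the core of pr_count() in the deleted pr_data_get.py bins commits into 27 weekly buckets over the 182-day window on either side of the first README/CONTRIBUTING commit, counting merges separately and dropping anything past week 26. A minimal, self-contained sketch of that bucketing idea follows; the function and variable names here are illustrative, not taken from the deleted script.

    # Sketch of the weekly bucketing used by pr_count(): commits after `start`
    # are binned by week; merges are tallied separately; commits past the
    # final bucket are ignored. Names are hypothetical.
    import datetime as dt

    def weekly_commit_counts(commit_dates, merge_flags, start, weeks=27):
        by_week = [0] * weeks
        by_week_merge = [0] * weeks
        for when, is_merge in zip(commit_dates, merge_flags):
            if when <= start:
                continue
            bucket = (when - start).days // 7
            if bucket >= weeks:
                continue
            by_week[bucket] += 1
            if is_merge:
                by_week_merge[bucket] += 1
        return by_week, by_week_merge

    # Example: two commits inside the window (one a merge) and one 200 days
    # out, which falls past week 26 and is dropped.
    start = dt.datetime(2020, 1, 1)
    dates = [start + dt.timedelta(days=d) for d in (3, 10, 200)]
    print(weekly_commit_counts(dates, [False, True, False], start))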