From c0b4a905c61e7c05dc6f92a41529165af60872c3 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Sun, 18 May 2025 18:55:08 -0500 Subject: [PATCH] uncommitted changes --- 12825_revision/get_spec_file.py | 116 ------------- 12825_revision/intermediary_script.py | 224 -------------------------- 12825_revision/pr_data_get.py | 175 -------------------- 3 files changed, 515 deletions(-) delete mode 100644 12825_revision/get_spec_file.py delete mode 100644 12825_revision/intermediary_script.py delete mode 100644 12825_revision/pr_data_get.py diff --git a/12825_revision/get_spec_file.py b/12825_revision/get_spec_file.py deleted file mode 100644 index e0ee467..0000000 --- a/12825_revision/get_spec_file.py +++ /dev/null @@ -1,116 +0,0 @@ -import csv -from git import Repo -from perceval.backends.core.git import Git -import os -import datetime as dt -import time -import shutil -import pandas as pd -import dateutil -from tqdm import tqdm -import math -import io -import re - -working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2" -temp_dir = "/data/users/mgaughan/tmp3/" - -# getting the specific readme or contributing file from a given commit -# inputs: upstream vcs link, commit hash, yes/no is it a readme -def get_file(vcs_link, commit_hash, is_readme): - if "github" in vcs_link or "gitlab" in vcs_link: - #making an evaluation that sub branches aren't being used and that people would fork if needed - #this only looks at main - vcs_link = "/".join(vcs_link.split("/")[0:5]) - full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git" - other_temp_path = temp_dir + vcs_link.split('/')[4] + ".git0" - else: - full_temp_path = temp_dir + vcs_link.split('/')[- 1] + ".git" - other_temp_path = temp_dir + vcs_link.split('/')[- 1] + ".git0" - repo0 = Repo.clone_from(vcs_link, full_temp_path) - repo = Git(uri=vcs_link, gitpath=other_temp_path) - commit0 = repo0.commit(commit_hash) - commits = repo.fetch() - target_filename = "" - for commit in commits: - files = commit['data']['files'] - for file in files: - if is_readme: - if "README" in file['file']: - #print(file['file']) - if "/" in file['file']: - target_filename = file['file'].split("/")[-1] - else: - target_filename = file['file'] - else: - if "CONTRIBUTING" in file['file']: - if "/" in file['file']: - target_filename = file['file'].split("/")[-1] - else: - target_filename = file['file'] - - if target_filename == "": - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "NoFile" - - targetfile = "" - for blob in commit0.tree.blobs: - #print(type(blob.path)) - if "CONTRIBUTING" in blob.path: - targetfile = blob - #print(blob.path) - # why would a file not be in the commit tree? but would be in the directory? 
- #shutil.rmtree(full_temp_path, ignore_errors=True) - # return "KeyError -- the file is not in the commit tree" - if targetfile == "": - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "KeyError -- the file is not in the commit tree" - if is_readme: - last_path = "readme2" - else: - last_path = "contributing2" - with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file: - with io.BytesIO(targetfile.data_stream.read()) as f: - file.write(f.read().decode('utf-8', errors='ignore')) - #file.write(f.read()) - file.close() - shutil.rmtree(full_temp_path, ignore_errors=True) - shutil.rmtree(other_temp_path, ignore_errors=True) - return "NoError" - -def for_all_files(): - #toggle this based on readme or contributing files - readme_is = False - csv_path = "final_data/deb_contrib_did_data.csv" - index = -1 - saved = [] - with open(csv_path, 'r') as file: - with open('e_031824_spec_errors.csv', "w") as writing_file: - csv_writer = csv.writer(writing_file) - with open("contributing_completed_downloads.csv", "w") as writing_file2: - csv_writer2 = csv.writer(writing_file2) - #csv_reader = csv.DictReader(file) - lines = [line for line in file] - for row in tqdm(csv.reader(lines), total=len(lines)): - index += 1 - if index == 0: - continue - if row[0] == "": - continue - #print(row[0]) - return_value = get_file(row[0], row[2], readme_is) - if return_value != "NoError": - csv_writer.writerow([row[0], row[2], readme_is, return_value]) - else: - if row[0] in saved: - continue - saved.append(row[0]) - csv_writer2.writerow(row) - # if it is noError, just write the row down in a different csv - # there's an issue of duplicates, but just keep it moving - # if no duplicates -- just run it through - -if __name__ == "__main__": - for_all_files() diff --git a/12825_revision/intermediary_script.py b/12825_revision/intermediary_script.py deleted file mode 100644 index b41b324..0000000 --- a/12825_revision/intermediary_script.py +++ /dev/null @@ -1,224 +0,0 @@ -import git -from tqdm import tqdm -import csv -import os -import shutil -import time -import pandas as pd -import datetime -import argparse - -temp_dir = "/data/users/mgaughan/tmp3/" -cst = datetime.timezone(datetime.timedelta(hours=-6)) -from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst) -to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst) -COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/" - -def temp_clone(vcs_link, temp_location): - """ - ARGS - vcs_link : url link to upstream repo vcs - temp_location : filepath to where the repo should be cloned to - - RETURNS - repo : the GitRepository object of the cloned repo - repo_path : the filepath to the cloned repository - """ - #print(temp_location) - vcs_link = vcs_link.strip() - os.makedirs(temp_location) - repo_path = temp_location - repo = git.Repo.clone_from(vcs_link, repo_path) - print(f"Successfully Cloned {vcs_link}") - return repo, repo_path - - -def delete_clone(temp_location): - """ - ARGS - temp_location : filepath to the cloned repository - - RETURNS - whether or not the deletion was a success - """ - if os.path.exists(temp_location): - shutil.rmtree(temp_location) - print(f"{temp_location} has been deleted.") - return 0 - else: - print("No clone at location") - return 1 - -# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments -def 
commit_analysis(repo, cutoff_date, start_date): - print("Analyzing Commits...") - commits_info = [] - for commit in repo.iter_commits(): - # if too far back, break - if commit.committed_datetime > start_date: - continue - if commit.committed_datetime < cutoff_date: - break - commit_info = { - "commit_hash": commit.hexsha, - "author_name": commit.author.name, - "author_email": commit.author.email, - "authored_date": commit.authored_datetime, - "committer_name": commit.committer.name, - "committer_email": commit.committer.email, - "commit_date": commit.committed_datetime, - "message": commit.message, - "is_merge": len(commit.parents) > 1, - } - # author/committer org information - commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0] - commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0] - # some more effort to get this information - commit_info["branches"] = repo.git.branch( - "--contains", commit_info["commit_hash"] - ) - # diff information - diffs = commit.diff( - commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True - ) - commit_info["diff_info"] = diff_analysis(diffs) - # print(commit_info) - commits_info.append(commit_info) - return commits_info - - -def diff_analysis(diffs): - diff_objects = [] - for diff in diffs: - diff_info = { - "lines_added": sum( - 1 - for line in diff.diff.decode("utf-8").split("\n") - if line.startswith("+") and not line.startswith("+++") - ), - "lines_deleted": sum( - 1 - for line in diff.diff.decode("utf-8").split("\n") - if line.startswith("-") and not line.startswith("---") - ), - "parent_filepath": diff.a_path, - "child_filepath": diff.b_path, - "change_type": diff.change_type, - "new_file": diff.new_file, - "deleted_file": diff.deleted_file, - "renamed_file": diff.renamed, - #'diff': diff.diff.decode('utf-8') - } - diff_objects.append(diff_info) - return diff_objects - -def for_all_files(start_index, stop_index): - cwd = os.getcwd() - csv_path = "../final_data/deb_full_data.csv" - index = -1 - saved = [] - empty_row = 0 - clone_error =[] - has_readme = 0 - has_contributing = 0 - with open(csv_path, 'r') as file: - csv_reader = csv.DictReader(file) - lines = [line for line in file] - for row in tqdm(csv.reader(lines), total=len(lines)): - index += 1 - #time.sleep(5) - if index < start_index: - continue - if row[0] == "": - empty_row += 1 - continue - #row[5] = upstream vcs - temp_repo_path = "" - und_repo_id = "" - try: - os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no' - os.environ['GIT_ASKPASS'] = 'false' - os.environ['GIT_TERMINAL_PROMPT'] = '0' - ssh_url = "" - try: - if "github" in row[5]: - repo_id = row[5][len('https://github.com/'):] - ssh_url = f'git@github.com:{repo_id}.git' - if ssh_url.endswith('.git.git'): - ssh_url = ssh_url[:-4] - temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir) - else: - parts = row[5].split('/') - domain = parts[2] - repo_id = '/'.join(parts[3:]) - try: - temp_repo, temp_repo_path = temp_clone(row[5], temp_dir) - except Exception as e: - print(f'non-Github cloning error, assuming HTTPS issue: {e}') - delete_clone(temp_dir) - ssh_url = f'git@{domain}:{repo_id}.git' - if ssh_url.endswith('.git.git'): - ssh_url = ssh_url[:-4] - temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir) - except Exception as e: - print(f'cloning error at {row[5]}') - print(f'inside cloning error: {e}') - raise ValueError(e) - os.chdir(temp_repo_path) - os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' 
master`") - os.chdir(cwd) - has_readme_bool, has_contributing_bool = False, False - for filename in os.listdir(temp_repo_path): - if filename.startswith("README"): - has_readme_bool = True - if filename.startswith("CONTRIBUTING"): - has_contributing_bool = True - if has_readme_bool or has_contributing_bool: - commits_array = commit_analysis(temp_repo, from_date, to_date) - commits_df = pd.DataFrame.from_records(commits_array) - und_repo_id = '_'.join(repo_id.split("/")) - if has_readme_bool: - has_readme += 1 - commits_df.to_csv( - f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv", - index=False, - ) - if has_contributing_bool: - has_contributing += 1 - commits_df.to_csv( - f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv", - index=False, - ) - except Exception as e: - clone_error.append([row[5], e]) - print(f"outside cloning error: {e}") - finally: - und_repo_id = "" - delete_clone(temp_dir) - os.chdir(cwd) - - if index == stop_index: - break - - print(clone_error) - with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file: - for error in clone_error: - txt_file.write(error + "\n") - with open(f"{stop_index}-success-output.txt", "w") as txt_file: - txt_file.write(f"Number of Empty Rows: {empty_row} \n") - txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n") - txt_file.write(f"Number that has README: {has_readme} \n") - txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}") - print(f"Number of Empty Rows: {empty_row}") - print(f"Number of Cloning Errors: {len(clone_error)}") - print(f"Number that has README: {has_readme}") - print(f"Number that has CONTRIBUTING: {has_contributing}") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="chase validation") - parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search") - parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search") - args = parser.parse_args() - for_all_files(args.start_index, args.stop_index) - #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir) - #delete_clone(temp_dir) \ No newline at end of file diff --git a/12825_revision/pr_data_get.py b/12825_revision/pr_data_get.py deleted file mode 100644 index 0969355..0000000 --- a/12825_revision/pr_data_get.py +++ /dev/null @@ -1,175 +0,0 @@ -import csv -from perceval.backends.core.git import Git -import os -import datetime as dt -import time -import shutil -import pandas as pd -import dateutil -from tqdm import tqdm -import math - -key = os.environ.get('KKEXKEY') - -early_cutoff = dt.datetime(2008,2, 8) -temp_dir = "/data/users/mgaughan/tmp/" - -''' -- rate of change, rate of all/day -''' -def file_get_pr(upstream_vcs_link, me_read): - # if we're looking at readmes me_read is true and if not, if we're looking at contributing files, it's false - #this is the window of days on either side of the event that we're looking at - window = 182 - #print(upstream_vcs_link.split('/')[4]) - project_dict = {} - project_dict['upstream_vcs_link'] = upstream_vcs_link - upstream_vcs_link = upstream_vcs_link.strip() - if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link: - #making an evaluation that sub branches aren't being used and that people would fork if needed - #this only looks at main - upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5]) - print(upstream_vcs_link) - full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git" - else: 
- full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git" - print(upstream_vcs_link) - if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib": - shutil.rmtree(full_temp_path, ignore_errors=True) - return {} - repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path) - try: - commits = repo.fetch() - except: - print("perceval issue") - return - has_readme = False - has_contributing = False - merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0 - #list of tuples which has date and whether it was a merge - commit_list = [] - first_date_readme = "" - for commit in commits: - #print(commit['data']) - if "Merge" in commit['data'].keys(): - commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']]) - if has_contributing: - merge_post_cont += 1 - else: - merge_pre_cont += 1 - else: - commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']]) - files = commit['data']['files'] - #print(commit['data']['CommitDate']) - #print(type(dateutil.parser.parse(commit['data']['CommitDate']))) - for file in files: - if "CONTRIBUTING" in file['file'] and has_contributing == False: - has_contributing = True - first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate']) - if "README" in file['file'] and has_readme == False: - has_readme = True - first_date_readme = dateutil.parser.parse(commit['data']['CommitDate']) - project_dict['readme_commit_hash'] = commit['data']['commit'] - shutil.rmtree(full_temp_path, ignore_errors=True) - if first_date_readme == "": - return {} - if me_read: - project_dict['first_readme'] = first_date_readme - before_read = pr_count(first_date_readme+ dt.timedelta(days=-window, hours=0), first_date_readme, commit_list, [], []) - if before_read != None: - project_dict['before_allcom_read'] = before_read[0] - project_dict['before_mrg_read'] = before_read[1] - project_dict['before_auth_new'] = before_read[2] - project_dict['before_commit_new'] = before_read[3] - else: - return {} - after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window, hours=0), commit_list, before_read[4], before_read[5]) - if after_read != None: - project_dict['after_allcom_read'] = after_read[0] - project_dict['after_mrg_read'] = after_read[1] - project_dict['after_auth_new'] = after_read[2] - project_dict['after_commit_new'] = after_read[3] - else: - return {} - else: - project_dict['first_contributing'] = first_date_contributing - before_cont = pr_count(first_date_contributing + dt.timedelta(days=-window, hours=0), first_date_contributing, commit_list, [], []) - if before_cont != None: - project_dict['before_allcom_cont'] = before_cont[0] - project_dict['before_mrg_cont'] = before_cont[1] - project_dict['before_auth_new'] = before_cont[2] - project_dict['before_commit_new'] = before_cont[3] - else: - return {} - after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window, hours=0), commit_list, before_cont[4], before_cont[5]) - if after_cont != None: - project_dict['after_allcom_cont'] = after_cont[0] - project_dict['after_mrg_cont'] = after_cont[1] - project_dict['after_auth_new'] = after_cont[2] - project_dict['after_commit_new'] = after_cont[3] - else: - return {} - print(project_dict) - return project_dict - - -#TODO: pr_count should return an array of values for weekly/6mo -def pr_count(start, end, commits, author_roster, 
commit_roster): - count = 0 - merge_count = 0 - by_week = [0] * 27 - by_week_merge =[0] * 27 - current_week = 0 - new_authors = 0 - new_committers = 0 - for commit in tqdm(commits): - if dateutil.parser.parse(commit[0]) <= start: - if commit[2] not in author_roster: - author_roster.append(commit[2]) - if commit[1] and commit[3] not in commit_roster: - commit_roster.append(commit[3]) - if dateutil.parser.parse(commit[0]) > start: - if math.floor((dateutil.parser.parse(commit[0]) - start).days / 7) <= 26: - by_week[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1 - if commit[1]: - by_week_merge[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1 - if commit[3] not in commit_roster: - new_committers += 1 - #remaining question of whether to make this the author of the merge commit[2] or the committer of the merge commit[3] - commit_roster.append(commit[3]) - if commit[2] not in author_roster: - new_authors += 1 - author_roster.append(commit[2]) - if dateutil.parser.parse(commit[0]) > end: - print(len(by_week)) - return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster] - -def for_files(): - #csv_path = "final_data/deb_contribfile_roster.csv" - csv_path = "final_data/deb_readme_roster.csv" - count = 0 - with open(csv_path, 'r') as file: - csv_reader = csv.DictReader(file) - with open('kk_031624_pr_did.csv', "w") as writing_file: - # this would also have to get switched fro the cont dataset - keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new'] - dict_writer = csv.DictWriter(writing_file, keys) - dict_writer.writeheader() - for row in csv_reader: - count += 1 - print(row['upstream_vcs_link']) - # this would have to get switched to false for the cont dataset - try: - dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) - except: - dict_row = {} - dict_writer.writerow(dict_row) - - -if __name__ == "__main__": - for_files() - #file_get_pr("https://github.com/tqdm/tqdm", True) - #file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True) - #file_get_pr("https://github.com/walling/unorm/issues/new/", True) - #file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True) -
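
For reference, the core of pr_count() in the deleted pr_data_get.py bins commits into 27 weekly buckets over the 182-day window on either side of the first README/CONTRIBUTING commit, counting merges separately and dropping anything past week 26. A minimal, self-contained sketch of that bucketing idea follows; the function and variable names here are illustrative, not taken from the deleted script.

    # Sketch of the weekly bucketing used by pr_count(): commits after `start`
    # are binned by week; merges are tallied separately; commits past the
    # final bucket are ignored. Names are hypothetical.
    import datetime as dt

    def weekly_commit_counts(commit_dates, merge_flags, start, weeks=27):
        by_week = [0] * weeks
        by_week_merge = [0] * weeks
        for when, is_merge in zip(commit_dates, merge_flags):
            if when <= start:
                continue
            bucket = (when - start).days // 7
            if bucket >= weeks:
                continue
            by_week[bucket] += 1
            if is_merge:
                by_week_merge[bucket] += 1
        return by_week, by_week_merge

    # Example: two commits inside the window (one a merge) and one 200 days
    # out, which falls past week 26 and is dropped.
    start = dt.datetime(2020, 1, 1)
    dates = [start + dt.timedelta(days=d) for d in (3, 10, 200)]
    print(weekly_commit_counts(dates, [False, True, False], start))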