From 2d9ce17e3a5004622ca7e5e837500fc2883dad82 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Tue, 28 Jan 2025 23:04:51 -0600
Subject: [PATCH] backing up with revision

---
 12825_revision/get_spec_file.py       | 116 +++++++++++++
 12825_revision/intermediary_script.py | 224 ++++++++++++++++++++++++++
 12825_revision/pr_data_get.py         | 175 ++++++++++++++++++++
 3 files changed, 515 insertions(+)
 create mode 100644 12825_revision/get_spec_file.py
 create mode 100644 12825_revision/intermediary_script.py
 create mode 100644 12825_revision/pr_data_get.py

diff --git a/12825_revision/get_spec_file.py b/12825_revision/get_spec_file.py
new file mode 100644
index 0000000..e0ee467
--- /dev/null
+++ b/12825_revision/get_spec_file.py
@@ -0,0 +1,116 @@
+import csv
+from git import Repo
+from perceval.backends.core.git import Git
+import os
+import datetime as dt
+import time
+import shutil
+import pandas as pd
+import dateutil
+from tqdm import tqdm
+import math
+import io
+import re
+
+working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2"
+temp_dir = "/data/users/mgaughan/tmp3/"
+
+# getting the specific readme or contributing file from a given commit
+# inputs: upstream vcs link, commit hash, yes/no is it a readme
+def get_file(vcs_link, commit_hash, is_readme):
+    if "github" in vcs_link or "gitlab" in vcs_link:
+        # making an evaluation that sub branches aren't being used and that people would fork if needed
+        # this only looks at main
+        vcs_link = "/".join(vcs_link.split("/")[0:5])
+        full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
+        other_temp_path = temp_dir + vcs_link.split('/')[4] + ".git0"
+    else:
+        full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
+        other_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git0"
+    repo0 = Repo.clone_from(vcs_link, full_temp_path)
+    repo = Git(uri=vcs_link, gitpath=other_temp_path)
+    commit0 = repo0.commit(commit_hash)
+    commits = repo.fetch()
+    target_filename = ""
+    for commit in commits:
+        files = commit['data']['files']
+        for file in files:
+            if is_readme:
+                if "README" in file['file']:
+                    #print(file['file'])
+                    if "/" in file['file']:
+                        target_filename = file['file'].split("/")[-1]
+                    else:
+                        target_filename = file['file']
+            else:
+                if "CONTRIBUTING" in file['file']:
+                    if "/" in file['file']:
+                        target_filename = file['file'].split("/")[-1]
+                    else:
+                        target_filename = file['file']
+
+    if target_filename == "":
+        shutil.rmtree(full_temp_path, ignore_errors=True)
+        shutil.rmtree(other_temp_path, ignore_errors=True)
+        return "NoFile"
+
+    targetfile = ""
+    for blob in commit0.tree.blobs:
+        #print(type(blob.path))
+        # look for the kind of file this call was actually asked for
+        if ("README" if is_readme else "CONTRIBUTING") in blob.path:
+            targetfile = blob
+            #print(blob.path)
+    # why would a file not be in the commit tree? but would be in the directory?
+    #shutil.rmtree(full_temp_path, ignore_errors=True)
+    # return "KeyError -- the file is not in the commit tree"
+    if targetfile == "":
+        shutil.rmtree(full_temp_path, ignore_errors=True)
+        shutil.rmtree(other_temp_path, ignore_errors=True)
+        return "KeyError -- the file is not in the commit tree"
+    if is_readme:
+        last_path = "readme2"
+    else:
+        last_path = "contributing2"
+    with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path, "w") as file:
+        with io.BytesIO(targetfile.data_stream.read()) as f:
+            file.write(f.read().decode('utf-8', errors='ignore'))
+            #file.write(f.read())
+        file.close()
+    shutil.rmtree(full_temp_path, ignore_errors=True)
+    shutil.rmtree(other_temp_path, ignore_errors=True)
+    return "NoError"
+
+def for_all_files():
+    # toggle this based on readme or contributing files
+    readme_is = False
+    csv_path = "final_data/deb_contrib_did_data.csv"
+    index = -1
+    saved = []
+    with open(csv_path, 'r') as file:
+        with open('e_031824_spec_errors.csv', "w") as writing_file:
+            csv_writer = csv.writer(writing_file)
+            with open("contributing_completed_downloads.csv", "w") as writing_file2:
+                csv_writer2 = csv.writer(writing_file2)
+                #csv_reader = csv.DictReader(file)
+                lines = [line for line in file]
+                for row in tqdm(csv.reader(lines), total=len(lines)):
+                    index += 1
+                    if index == 0:
+                        continue
+                    if row[0] == "":
+                        continue
+                    #print(row[0])
+                    return_value = get_file(row[0], row[2], readme_is)
+                    if return_value != "NoError":
+                        csv_writer.writerow([row[0], row[2], readme_is, return_value])
+                    else:
+                        # if it is NoError, just write the row down in a different csv
+                        # there's an issue of duplicates, but just keep it moving
+                        # if no duplicates -- just run it through
+                        if row[0] in saved:
+                            continue
+                        saved.append(row[0])
+                        csv_writer2.writerow(row)
+
+if __name__ == "__main__":
+    for_all_files()
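Note on the interface above: get_file() is only ever driven by for_all_files(), which feeds it row[0] (the upstream VCS link) and row[2] (the commit hash) from the did-data CSV. A minimal one-off call would look like the sketch below; the URL and hash are hypothetical placeholders, and it assumes the module, its dependencies, and the hard-coded output directories are all in place.

    # illustrative one-off call, not part of the patch; URL and hash are made-up placeholders
    from get_spec_file import get_file

    status = get_file(
        "https://github.com/example/project",         # hypothetical upstream VCS link
        "0123456789abcdef0123456789abcdef01234567",   # hypothetical commit hash
        False,                                        # False = fetch CONTRIBUTING, True = fetch README
    )
    print(status)  # "NoError", "NoFile", or "KeyError -- the file is not in the commit tree"
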
diff --git a/12825_revision/intermediary_script.py b/12825_revision/intermediary_script.py
new file mode 100644
index 0000000..b41b324
--- /dev/null
+++ b/12825_revision/intermediary_script.py
@@ -0,0 +1,224 @@
+import git
+from tqdm import tqdm
+import csv
+import os
+import shutil
+import time
+import pandas as pd
+import datetime
+import argparse
+
+temp_dir = "/data/users/mgaughan/tmp3/"
+cst = datetime.timezone(datetime.timedelta(hours=-6))
+from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
+to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
+COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"
+
+def temp_clone(vcs_link, temp_location):
+    """
+    ARGS
+        vcs_link : url link to upstream repo vcs
+        temp_location : filepath to where the repo should be cloned to
+
+    RETURNS
+        repo : the GitRepository object of the cloned repo
+        repo_path : the filepath to the cloned repository
+    """
+    #print(temp_location)
+    vcs_link = vcs_link.strip()
+    os.makedirs(temp_location)
+    repo_path = temp_location
+    repo = git.Repo.clone_from(vcs_link, repo_path)
+    print(f"Successfully Cloned {vcs_link}")
+    return repo, repo_path
+
+
+def delete_clone(temp_location):
+    """
+    ARGS
+        temp_location : filepath to the cloned repository
+
+    RETURNS
+        whether or not the deletion was a success
+    """
+    if os.path.exists(temp_location):
+        shutil.rmtree(temp_location)
+        print(f"{temp_location} has been deleted.")
+        return 0
+    else:
+        print("No clone at location")
+        return 1
+
+# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
+def commit_analysis(repo, cutoff_date, start_date):
+    print("Analyzing Commits...")
+    commits_info = []
+    for commit in repo.iter_commits():
+        # too recent: skip until we are inside the window
+        if commit.committed_datetime > start_date:
+            continue
+        # if too far back, break
+        if commit.committed_datetime < cutoff_date:
+            break
+        commit_info = {
+            "commit_hash": commit.hexsha,
+            "author_name": commit.author.name,
+            "author_email": commit.author.email,
+            "authored_date": commit.authored_datetime,
+            "committer_name": commit.committer.name,
+            "committer_email": commit.committer.email,
+            "commit_date": commit.committed_datetime,
+            "message": commit.message,
+            "is_merge": len(commit.parents) > 1,
+        }
+        # author/committer org information
+        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
+        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
+        # some more effort to get this information
+        commit_info["branches"] = repo.git.branch(
+            "--contains", commit_info["commit_hash"]
+        )
+        # diff information
+        diffs = commit.diff(
+            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
+        )
+        commit_info["diff_info"] = diff_analysis(diffs)
+        # print(commit_info)
+        commits_info.append(commit_info)
+    return commits_info
+
+
+def diff_analysis(diffs):
+    diff_objects = []
+    for diff in diffs:
+        diff_info = {
+            "lines_added": sum(
+                1
+                for line in diff.diff.decode("utf-8").split("\n")
+                if line.startswith("+") and not line.startswith("+++")
+            ),
+            "lines_deleted": sum(
+                1
+                for line in diff.diff.decode("utf-8").split("\n")
+                if line.startswith("-") and not line.startswith("---")
+            ),
+            "parent_filepath": diff.a_path,
+            "child_filepath": diff.b_path,
+            "change_type": diff.change_type,
+            "new_file": diff.new_file,
+            "deleted_file": diff.deleted_file,
+            "renamed_file": diff.renamed,
+            #'diff': diff.diff.decode('utf-8')
+        }
+        diff_objects.append(diff_info)
+    return diff_objects
+
+def for_all_files(start_index, stop_index):
+    cwd = os.getcwd()
+    csv_path = "../final_data/deb_full_data.csv"
+    index = -1
+    saved = []
+    empty_row = 0
+    clone_error = []
+    has_readme = 0
+    has_contributing = 0
+    with open(csv_path, 'r') as file:
+        csv_reader = csv.DictReader(file)
+        lines = [line for line in file]
+        for row in tqdm(csv.reader(lines), total=len(lines)):
+            index += 1
+            #time.sleep(5)
+            if index < start_index:
+                continue
+            if row[0] == "":
+                empty_row += 1
+                continue
+            #row[5] = upstream vcs
+            temp_repo_path = ""
+            und_repo_id = ""
+            try:
+                os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
+                os.environ['GIT_ASKPASS'] = 'false'
+                os.environ['GIT_TERMINAL_PROMPT'] = '0'
+                ssh_url = ""
+                try:
+                    if "github" in row[5]:
+                        repo_id = row[5][len('https://github.com/'):]
+                        ssh_url = f'git@github.com:{repo_id}.git'
+                        if ssh_url.endswith('.git.git'):
+                            ssh_url = ssh_url[:-4]
+                        temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
+                    else:
+                        parts = row[5].split('/')
+                        domain = parts[2]
+                        repo_id = '/'.join(parts[3:])
+                        try:
+                            temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
+                        except Exception as e:
+                            print(f'non-Github cloning error, assuming HTTPS issue: {e}')
+                            delete_clone(temp_dir)
+                            ssh_url = f'git@{domain}:{repo_id}.git'
+                            if ssh_url.endswith('.git.git'):
+                                ssh_url = ssh_url[:-4]
+                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
+                except Exception as e:
+                    print(f'cloning error at {row[5]}')
+                    print(f'inside cloning error: {e}')
+                    raise ValueError(e)
+                os.chdir(temp_repo_path)
+                os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
+                os.chdir(cwd)
+                has_readme_bool, has_contributing_bool = False, False
+                for filename in os.listdir(temp_repo_path):
+                    if filename.startswith("README"):
+                        has_readme_bool = True
+                    if filename.startswith("CONTRIBUTING"):
+                        has_contributing_bool = True
+                if has_readme_bool or has_contributing_bool:
+                    commits_array = commit_analysis(temp_repo, from_date, to_date)
+                    commits_df = pd.DataFrame.from_records(commits_array)
+                    und_repo_id = '_'.join(repo_id.split("/"))
+                    if has_readme_bool:
+                        has_readme += 1
+                        commits_df.to_csv(
+                            f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
+                            index=False,
+                        )
+                    if has_contributing_bool:
+                        has_contributing += 1
+                        commits_df.to_csv(
+                            f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
+                            index=False,
+                        )
+            except Exception as e:
+                clone_error.append([row[5], e])
+                print(f"outside cloning error: {e}")
+            finally:
+                und_repo_id = ""
+                delete_clone(temp_dir)
+                os.chdir(cwd)
+
+            if index == stop_index:
+                break
+
+    print(clone_error)
+    with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
+        for error in clone_error:
+            # each entry is [vcs_link, exception]; format it rather than concatenating a list to a string
+            txt_file.write(f"{error[0]}: {error[1]}\n")
+    with open(f"{stop_index}-success-output.txt", "w") as txt_file:
+        txt_file.write(f"Number of Empty Rows: {empty_row} \n")
+        txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
+        txt_file.write(f"Number that has README: {has_readme} \n")
+        txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
+    print(f"Number of Empty Rows: {empty_row}")
+    print(f"Number of Cloning Errors: {len(clone_error)}")
+    print(f"Number that has README: {has_readme}")
+    print(f"Number that has CONTRIBUTING: {has_contributing}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="chase validation")
+    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
+    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
+    args = parser.parse_args()
+    for_all_files(args.start_index, args.stop_index)
+    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
+    #delete_clone(temp_dir)
\ No newline at end of file
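The script above is run in index slices, e.g. python intermediary_script.py --start_index 0 --stop_index 100, and writes one commits CSV per repository under COMMIT_SAVE_PREFIX. As a rough post-run sketch (an assumption about how the outputs might be combined, not part of the patch, and assuming intermediary_script is importable), the per-repository files can be pulled into a single frame:

    # aggregation sketch; assumes the readme_commit_data/ CSVs were written by for_all_files()
    import glob
    import pandas as pd
    from intermediary_script import COMMIT_SAVE_PREFIX

    frames = []
    for path in glob.glob(f"{COMMIT_SAVE_PREFIX}readme_commit_data/*_commits.csv"):
        df = pd.read_csv(path)
        df["repo"] = path.split("/")[-1].replace("_commits.csv", "")
        frames.append(df)
    if frames:
        all_commits = pd.concat(frames, ignore_index=True)
        print(all_commits.groupby("repo")["is_merge"].mean())  # share of merge commits per repo
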
diff --git a/12825_revision/pr_data_get.py b/12825_revision/pr_data_get.py
new file mode 100644
index 0000000..0969355
--- /dev/null
+++ b/12825_revision/pr_data_get.py
@@ -0,0 +1,175 @@
+import csv
+from perceval.backends.core.git import Git
+import os
+import datetime as dt
+import time
+import shutil
+import pandas as pd
+import dateutil
+from tqdm import tqdm
+import math
+
+key = os.environ.get('KKEXKEY')
+
+early_cutoff = dt.datetime(2008, 2, 8)
+temp_dir = "/data/users/mgaughan/tmp/"
+
+'''
+- rate of change, rate of all/day
+'''
+def file_get_pr(upstream_vcs_link, me_read):
+    # me_read is True when we're looking at README files and False when we're looking at CONTRIBUTING files
+    # this is the window of days on either side of the event that we're looking at
+    window = 182
+    #print(upstream_vcs_link.split('/')[4])
+    project_dict = {}
+    project_dict['upstream_vcs_link'] = upstream_vcs_link
+    upstream_vcs_link = upstream_vcs_link.strip()
+    if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
+        # making an evaluation that sub branches aren't being used and that people would fork if needed
+        # this only looks at main
+        upstream_vcs_link = "/".join(upstream_vcs_link.split("/")[0:5])
+        print(upstream_vcs_link)
+        full_temp_path = temp_dir + upstream_vcs_link.split('/')[4] + ".git"
+    else:
+        full_temp_path = temp_dir + upstream_vcs_link.split('/')[-1] + ".git"
+        print(upstream_vcs_link)
+    if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib":
+        shutil.rmtree(full_temp_path, ignore_errors=True)
+        return {}
+    repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
+    try:
+        commits = repo.fetch()
+    except:
+        print("perceval issue")
+        return
+    has_readme = False
+    has_contributing = False
+    merge_pre_rm, merge_post_rm, merge_pre_cont, merge_post_cont = 0, 0, 0, 0
+    # list of tuples which has date and whether it was a merge
+    commit_list = []
+    first_date_readme = ""
+    for commit in commits:
+        #print(commit['data'])
+        if "Merge" in commit['data'].keys():
+            commit_list.append([commit['data']['CommitDate'], True, commit['data']['Author'], commit['data']['Commit']])
+            if has_contributing:
+                merge_post_cont += 1
+            else:
+                merge_pre_cont += 1
+        else:
+            commit_list.append([commit['data']['CommitDate'], False, commit['data']['Author'], commit['data']['Commit']])
+        files = commit['data']['files']
+        #print(commit['data']['CommitDate'])
+        #print(type(dateutil.parser.parse(commit['data']['CommitDate'])))
+        for file in files:
+            if "CONTRIBUTING" in file['file'] and has_contributing == False:
+                has_contributing = True
+                first_date_contributing = dateutil.parser.parse(commit['data']['CommitDate'])
+            if "README" in file['file'] and has_readme == False:
+                has_readme = True
+                first_date_readme = dateutil.parser.parse(commit['data']['CommitDate'])
+                project_dict['readme_commit_hash'] = commit['data']['commit']
+    shutil.rmtree(full_temp_path, ignore_errors=True)
+    if first_date_readme == "":
+        return {}
+    if me_read:
+        project_dict['first_readme'] = first_date_readme
+        before_read = pr_count(first_date_readme + dt.timedelta(days=-window, hours=0), first_date_readme, commit_list, [], [])
+        if before_read != None:
+            project_dict['before_allcom_read'] = before_read[0]
+            project_dict['before_mrg_read'] = before_read[1]
+            project_dict['before_auth_new'] = before_read[2]
+            project_dict['before_commit_new'] = before_read[3]
+        else:
+            return {}
+        after_read = pr_count(first_date_readme, first_date_readme + dt.timedelta(days=window, hours=0), commit_list, before_read[4], before_read[5])
+        if after_read != None:
+            project_dict['after_allcom_read'] = after_read[0]
+            project_dict['after_mrg_read'] = after_read[1]
+            project_dict['after_auth_new'] = after_read[2]
+            project_dict['after_commit_new'] = after_read[3]
+        else:
+            return {}
+    else:
+        project_dict['first_contributing'] = first_date_contributing
+        before_cont = pr_count(first_date_contributing + dt.timedelta(days=-window, hours=0), first_date_contributing, commit_list, [], [])
+        if before_cont != None:
+            project_dict['before_allcom_cont'] = before_cont[0]
+            project_dict['before_mrg_cont'] = before_cont[1]
+            project_dict['before_auth_new'] = before_cont[2]
+            project_dict['before_commit_new'] = before_cont[3]
+        else:
+            return {}
+        after_cont = pr_count(first_date_contributing, first_date_contributing + dt.timedelta(days=window, hours=0), commit_list, before_cont[4], before_cont[5])
+        if after_cont != None:
+            project_dict['after_allcom_cont'] = after_cont[0]
+            project_dict['after_mrg_cont'] = after_cont[1]
+            project_dict['after_auth_new'] = after_cont[2]
+            project_dict['after_commit_new'] = after_cont[3]
+        else:
+            return {}
+    print(project_dict)
+    return project_dict
+
+
+#TODO: pr_count should return an array of values for weekly/6mo
+def pr_count(start, end, commits, author_roster, commit_roster):
+    count = 0
+    merge_count = 0
+    by_week = [0] * 27
+    by_week_merge = [0] * 27
+    current_week = 0
+    new_authors = 0
+    new_committers = 0
+    for commit in tqdm(commits):
+        if dateutil.parser.parse(commit[0]) <= start:
+            if commit[2] not in author_roster:
+                author_roster.append(commit[2])
+            if commit[1] and commit[3] not in commit_roster:
+                commit_roster.append(commit[3])
+        if dateutil.parser.parse(commit[0]) > start:
+            if math.floor((dateutil.parser.parse(commit[0]) - start).days / 7) <= 26:
+                by_week[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
+                if commit[1]:
+                    by_week_merge[math.floor((dateutil.parser.parse(commit[0]) - start).days / 7)] += 1
+                    if commit[3] not in commit_roster:
+                        new_committers += 1
+                        # remaining question of whether to make this the author of the merge commit[2] or the committer of the merge commit[3]
+                        commit_roster.append(commit[3])
+            if commit[2] not in author_roster:
+                new_authors += 1
+                author_roster.append(commit[2])
+        if dateutil.parser.parse(commit[0]) > end:
+            print(len(by_week))
+            return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
+
+def for_files():
+    #csv_path = "final_data/deb_contribfile_roster.csv"
+    csv_path = "final_data/deb_readme_roster.csv"
+    count = 0
+    with open(csv_path, 'r') as file:
+        csv_reader = csv.DictReader(file)
+        with open('kk_031624_pr_did.csv', "w") as writing_file:
+            # this would also have to get switched for the cont dataset
+            keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
+            dict_writer = csv.DictWriter(writing_file, keys)
+            dict_writer.writeheader()
+            for row in csv_reader:
+                count += 1
+                print(row['upstream_vcs_link'])
+                # this would have to get switched to False for the cont dataset
+                try:
+                    dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True)
+                except:
+                    dict_row = {}
+                dict_writer.writerow(dict_row)
+
+
+if __name__ == "__main__":
+    for_files()
+    #file_get_pr("https://github.com/tqdm/tqdm", True)
+    #file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
+    #file_get_pr("https://github.com/walling/unorm/issues/new/", True)
+    #file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)
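For reference, pr_count() buckets commits into 27 weekly bins measured from start (the 182-day window is 26 weeks, plus one spill-over bin) and only returns its counts once it encounters a commit dated after end; otherwise the caller sees None. A toy illustration of that contract, not part of the patch and assuming the script's dependencies (perceval, pandas, python-dateutil) are importable, using a hand-made commit list in the [CommitDate, is_merge, author, committer] shape it expects:

    # toy check of pr_count()'s windowing; all dates and names are made up
    import datetime as dt
    from pr_data_get import pr_count

    start = dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc)
    end = start + dt.timedelta(days=182)
    commits = [
        ["2019-12-20 10:00:00 +0000", False, "alice", "alice"],  # before the window: seeds the author roster
        ["2020-01-10 10:00:00 +0000", True, "bob", "carol"],     # inside the window: a merge in week bin 1
        ["2020-08-01 10:00:00 +0000", False, "dave", "dave"],    # past the window: triggers the return
    ]
    by_week, by_week_merge, new_authors, new_committers, authors, committers = pr_count(start, end, commits, [], [])
    print(sum(by_week), sum(by_week_merge), new_authors, new_committers)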