"""Collect Gerrit change metadata for change URLs mentioned in comment text.

The script reads a CSV of discussion data, extracts every Wikimedia Gerrit
change URL found in the comment column, queries the Gerrit REST API for each
change, and writes the enriched DataFrame back to CSV.

Credentials are read from the ``GERRIT_USERNAME`` and ``GERRIT_HTTP_PASSWORD``
environment variables; when either is unset the requests are sent without
authentication.
"""
import datetime
import json
import os
import re
import sys
import time

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
#from urllib.parse import quote_plus
#from requests.auth import HTTPDigestAuth
#from pygerrit2 import GerritRestAPI, HTTPBasicAuth

# Format of the Gerrit links as written in messages:
#   https://gerrit.wikimedia.org/r/85783
# but the REST API needs it in this form:
#   https://gerrit.wikimedia.org/r/changes/85783
# e.g.
#   curl https://gerrit.wikimedia.org/r/changes/Ic69c2ad275389a31c9fbaf47f3665dcdbb7ac2af/detail

# Gerrit prepends this guard to every JSON response body to defeat XSSI.
_XSSI_PREFIX = ")]}'"

# Seconds to wait for Gerrit before giving up on a single request; without a
# timeout `requests` can block forever on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 60

# Captures the numeric change ID from a written change URL.
_CHANGE_NUMBER_RE = re.compile(
    r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)"
)

# Finds written change URLs (with any trailing path) inside free text.
_CHANGE_URL_RE = re.compile(
    r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?"
)

INPUT_CSV = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv"
OUTPUT_CSV = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv"


def _gerrit_auth():
    """Return ``(username, password)`` from the environment, or ``None``.

    Passing ``(None, None)`` to `requests` would send the literal credentials
    ``None:None``, so authentication is only used when both values are set.
    """
    username = os.environ.get("GERRIT_USERNAME")
    http_password = os.environ.get("GERRIT_HTTP_PASSWORD")
    if username and http_password:
        return (username, http_password)
    return None


def _extract_change_number(written_url):
    """Return the numeric change ID in *written_url* as a string, or ``None``."""
    match = _CHANGE_NUMBER_RE.search(written_url)
    return match.group(1) if match else None


def _request_change_detail(short_change_id):
    """GET the ``/detail`` endpoint for one change and return the response."""
    api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
    return requests.get(
        api_url,
        auth=_gerrit_auth(),
        headers={'Content-Type': 'application/json'},
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )


def _decode_gerrit_json(text):
    """Parse a Gerrit response body, dropping the XSSI guard if present.

    Raises:
        ValueError: if the remaining text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if text.startswith(_XSSI_PREFIX):
        text = text[len(_XSSI_PREFIX):]
    # json.loads tolerates the newline that follows the guard.
    return json.loads(text)


def _core_change_fields(full_result):
    """Pick the fields of interest out of a decoded change-detail response.

    The owner's e-mail and the reviewer list are optional in the response, so
    they default to ``None`` and ``[]``; every other key is required.

    Raises:
        KeyError: if a required key is missing from *full_result*.
    """
    owner = full_result['owner']
    reviewers = (full_result.get('reviewers') or {}).get('REVIEWER') or []
    return {
        #getting ID
        'id': full_result['change_id'],
        'project': full_result['project'],
        'description': full_result['subject'],
        #getting owner name and email
        'owner_dict': owner,
        'owner_email': owner.get('email'),
        #current revision count
        'revision_count': full_result['current_revision_number'],
        #insertions
        'code_insertions': full_result['insertions'],
        #deletions
        'code_deletions': full_result['deletions'],
        #status
        'status': full_result['status'],
        #reviewers
        'reviewer_count': len(reviewers),
        'reviewers': reviewers,
    }


def query_change_detail(written_url, sleep=10):
    """Fetch one change from Gerrit and return its selected and full metadata.

    Args:
        written_url: The change URL as written in a message. A numeric change
            URL is parsed for its ID; otherwise the last non-empty path
            segment is used as the change identifier.
        sleep: Seconds to pause before the request.

    Returns:
        ``[select_change_dict, result]``: the selected fields and the full
        decoded response, both annotated with ``written_url_in_message``.

    Raises:
        requests.RequestException: on network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the response lacks a required field.
    """
    time.sleep(sleep)
    short_change_id = (
        _extract_change_number(written_url)
        or written_url.rstrip("/").split("/")[-1]
    )
    response = _request_change_detail(short_change_id)
    # Surface HTTP errors directly instead of as an opaque JSON decode error.
    response.raise_for_status()
    result = _decode_gerrit_json(response.text)

    #making note of what the url was in the message
    select_change_dict = {'written_url_in_message': written_url}
    result['written_url_in_message'] = written_url
    select_change_dict.update(_core_change_fields(result))

    print(result)
    print(select_change_dict)
    return [select_change_dict, result]


def _get_change_details(written_url, sleep, cache):
    """Fetch the full change detail for one written URL, never raising.

    Args:
        written_url: The change URL as written in a message.
        sleep: Seconds to pause before each actual HTTP request.
        cache: Dict mapping change ID to an already-fetched result; updated
            in place on success so repeated mentions are fetched only once.

    Returns:
        ``{"written_url_in_message": ..., "full_result": ...}`` where
        ``full_result`` is ``None`` if the URL could not be parsed or the
        request failed.
    """
    failure = {"written_url_in_message": written_url, "full_result": None}

    short_change_id = _extract_change_number(written_url)
    if short_change_id is None:
        print(f"Error: No change ID found in URL: {written_url}")
        return failure

    if short_change_id in cache:
        return {
            "written_url_in_message": written_url,
            "full_result": cache[short_change_id],
        }

    # Throttle only when a request is actually going out.
    time.sleep(sleep)
    try:
        response = _request_change_detail(short_change_id)
    except requests.RequestException as e:
        # One flaky request must not abort a long collection run.
        print("Request error:", e)
        return failure

    if response.status_code != 200:
        print("Bad response:", response.status_code, response.text)
        return failure

    try:
        result = _decode_gerrit_json(response.text)
    except ValueError as e:
        print("JSON decode error:", e)
        print("Text was:", repr(response.text))
        return failure

    cache[short_change_id] = result
    return {"written_url_in_message": written_url, "full_result": result}


def _parse_selected_metadata(written_url, full_result):
    """Build the selected-fields dict for one fetched change.

    Returns:
        The selected fields, with ``reviewer_count`` and ``reviewers`` set to
        ``"NA"`` when the change has no reviewers, or ``{}`` if *full_result*
        lacks a required field or is not shaped like a change dict.
    """
    try:
        select_change_dict = {
            'written_url_in_message': written_url,
            'change_type': "/#/c/" if "/#/c/" in written_url else "just /r/",
        }
        select_change_dict.update(_core_change_fields(full_result))
    except (KeyError, TypeError, AttributeError) as e:
        print(f'Error in this URL: {written_url}')
        print(f'{type(e).__name__} ({e}) in this dictionary: {full_result}')
        return {}

    if not select_change_dict['reviewers']:
        select_change_dict['reviewer_count'] = "NA"
        select_change_dict['reviewers'] = "NA"
    return select_change_dict


def query_gerrit_changes(df, sleep=25):
    """Add Gerrit results for every URL in ``df["gerrit_change_urls"]``.

    Args:
        df: DataFrame with a ``gerrit_change_urls`` column holding a list of
            written change URLs per row.
        sleep: Seconds to pause before each HTTP request.

    Returns:
        The same DataFrame, modified in place, with two new columns:
        ``gerrit_full_results`` (one fetch record per URL) and
        ``selected_gerrit_results`` (selected fields for each successful
        fetch).
    """
    tqdm.pandas()
    fetched_by_change_id = {}

    #get the information from the Gerrit change from the URL that's written in the message
    def fetch_row(urls):
        # A non-list cell (e.g. NaN) has no URLs to query.
        if not isinstance(urls, (list, tuple)):
            return []
        return [
            _get_change_details(url, sleep, fetched_by_change_id)
            for url in urls
        ]

    def parse_row(results):
        return [
            _parse_selected_metadata(
                item['written_url_in_message'], item['full_result']
            )
            for item in results
            if item.get('full_result') and item.get('written_url_in_message')
        ]

    df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(fetch_row)
    df['selected_gerrit_results'] = df['gerrit_full_results'].progress_apply(parse_row)
    return df


def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
    """Add a column listing the Gerrit change URLs found in each row's text.

    Args:
        df: DataFrame to annotate; modified in place.
        text_column: Column holding the text to search (cast to ``str``).
        new_column: Name of the column that receives the list of URLs.

    Returns:
        The same DataFrame with *new_column* added.
    """
    df[new_column] = df[text_column].astype(str).apply(_CHANGE_URL_RE.findall)
    return df


def main():
    """Read the discussion CSV, attach Gerrit metadata, and write the result."""
    df = pd.read_csv(INPUT_CSV)
    #df = df.head(300)
    df = add_gerrit_urls_column(df)
    df = query_gerrit_changes(df)
    df.to_csv(OUTPUT_CSV, index=False)
    #print(df)
    #query_change_detail("https://gerrit.wikimedia.org/r/85783")


if __name__ == "__main__":
    main()