# mw-convo-collections/072525_gerrit_collection/gerrit_patch_collection.py

import os, sys
import json
import numpy as np
import pandas as pd
import requests
import re
import datetime
import time
from tqdm import tqdm
#from urllib.parse import quote_plus

#from requests.auth import HTTPDigestAuth
#from pygerrit2 import GerritRestAPI, HTTPBasicAuth

# Gerrit links are written in messages like this: https://gerrit.wikimedia.org/r/85783
# but the REST API needs the change under /changes/: https://gerrit.wikimedia.org/r/changes/85783
# e.g. curl https://gerrit.wikimedia.org/r/changes/Ic69c2ad275389a31c9fbaf47f3665dcdbb7ac2af/detail

def query_change_detail(
        written_url,
        sleep = 10,
        timeout = 60
        ):
    """Fetch the detail record for one Gerrit change and summarize it.

    Args:
        written_url: Gerrit change URL as written in a message, e.g.
            ``https://gerrit.wikimedia.org/r/85783``.
        sleep: Seconds to wait before sending the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A two-item list ``[select_change_dict, result]``: the selected
        metadata fields and the full decoded API response (which also gets
        a ``written_url_in_message`` key added).

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the server does not answer with HTTP 200, or the
            body cannot be decoded as JSON.
        KeyError: If a required field is missing from the response.
    """
    time.sleep(sleep)

    # Prefer the numeric change number right after /r/ (or /r/#/c/): taking
    # the last path segment breaks on a trailing slash and picks up the
    # patchset number in URLs such as .../r/#/c/123/4.
    match = re.search(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)", written_url)
    if match:
        short_change_id = match.group(1)
    else:
        # Fall back to the last path segment so other identifiers
        # (e.g. a Change-Id string) can still be passed through.
        short_change_id = written_url.rstrip("/").split("/")[-1]

    username = os.environ.get("GERRIT_USERNAME")
    http_password = os.environ.get("GERRIT_HTTP_PASSWORD")
    # Only attach basic auth when both credentials are configured;
    # otherwise the request is sent without credentials.
    auth = (username, http_password) if username and http_password else None

    api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"

    response = requests.get(
        api_url,
        auth=auth,
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            f"Bad response {response.status_code} from {api_url}: {response.text[:200]}"
        )

    # Gerrit prefixes JSON bodies with the 5-character guard ")]}'\n",
    # which has to be dropped before parsing.
    result = json.loads(response.text[5:])

    select_change_dict = {}
    #making note of what the url was in the message
    select_change_dict['written_url_in_message'] = written_url
    result['written_url_in_message'] = written_url
    #getting ID
    select_change_dict['id'] = result['change_id']
    select_change_dict['project'] = result['project']
    select_change_dict['description'] = result['subject']
    #getting owner name and email
    select_change_dict['owner_dict'] = result['owner']
    # the email may be missing from the owner record; store None then
    select_change_dict['owner_email'] = result['owner'].get('email')
    #current revision count
    select_change_dict['revision_count'] = result['current_revision_number']
    #insertions
    select_change_dict['code_insertions'] = result['insertions']
    #deletions
    select_change_dict['code_deletions'] = result['deletions']
    #status
    select_change_dict['status'] = result['status']
    #reviewers: a change without reviewers may lack the REVIEWER entry
    reviewers = result.get('reviewers', {}).get('REVIEWER', [])
    select_change_dict['reviewer_count'] = len(reviewers)
    select_change_dict['reviewers'] = reviewers
    print(result)
    print(select_change_dict)
    return [select_change_dict, result]


def query_gerrit_changes(df, sleep=25, timeout=60):
    """Query Gerrit for every change URL in ``df["gerrit_change_urls"]``.

    Two columns are added to ``df`` in place:

    * ``gerrit_full_results``: one dict per URL holding the written URL and
      the decoded ``/detail`` response (``None`` when the lookup failed).
    * ``selected_gerrit_results``: one dict of selected metadata per
      successful lookup.

    Args:
        df: DataFrame with a ``gerrit_change_urls`` column whose cells are
            lists of URL strings.
        sleep: Seconds to wait before each HTTP request.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The same DataFrame, with the two columns added.
    """
    tqdm.pandas()

    # Built once for the whole run instead of once per URL.
    change_url_re = re.compile(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)")
    username = os.environ.get("GERRIT_USERNAME")
    http_password = os.environ.get("GERRIT_HTTP_PASSWORD")
    # Only attach basic auth when both credentials are configured.
    auth = (username, http_password) if username and http_password else None

    #get the information from the Gerrit change from the URL that's written in the message
    def get_change_details(written_url):
        failed = {"written_url_in_message": written_url, "full_result": None}

        match = change_url_re.search(written_url)
        if not match:
            print(f"Error: No change ID found in URL: {written_url}")
            return failed
        api_url = f"https://gerrit.wikimedia.org/r/changes/{match.group(1)}/detail"

        # Throttle only when a request is really about to be sent.
        time.sleep(sleep)
        try:
            response = requests.get(
                api_url,
                auth=auth,
                headers={'Content-Type': 'application/json'},
                timeout=timeout,
            )
        except requests.RequestException as e:
            # One network failure must not abort the whole collection run.
            print("Request error:", e)
            return failed

        if response.status_code != 200:
            print("Bad response:", response.status_code, response.text)
            return failed

        try:
            # Gerrit prefixes JSON bodies with the 5-character guard ")]}'\n".
            result = json.loads(response.text[5:])
        except ValueError as e:
            print("JSON decode error:", e)
            print("Text was:", repr(response.text))
            return failed
        return {"written_url_in_message": written_url, "full_result": result}

    # Pick the fields of interest out of one decoded /detail response.
    def parse_selected_metadata(written_url, full_result):
        try:
            select_change_dict = {}
            select_change_dict['written_url_in_message'] = written_url
            # record which URL form was used in the message
            if "/#/c/" in written_url:
                select_change_dict['change_type'] = "/#/c/"
            else:
                select_change_dict['change_type'] = "just /r/"
            #getting ID
            select_change_dict['id'] = full_result['change_id']
            select_change_dict['project'] = full_result['project']
            select_change_dict['description'] = full_result['subject']
            #getting owner name and email
            select_change_dict['owner_dict'] = full_result['owner']
            # a missing email is marked "NA" rather than discarding the record
            select_change_dict['owner_email'] = full_result['owner'].get('email', "NA")
            #current revision count
            select_change_dict['revision_count'] = full_result['current_revision_number']
            #insertions
            select_change_dict['code_insertions'] = full_result['insertions']
            #deletions
            select_change_dict['code_deletions'] = full_result['deletions']
            #status
            select_change_dict['status'] = full_result['status']
            #reviewers
            reviewers = full_result.get('reviewers', {}).get('REVIEWER', [])
            if reviewers:
                select_change_dict['reviewer_count'] = len(reviewers)
                select_change_dict['reviewers'] = reviewers
            else:
                select_change_dict['reviewer_count'] = "NA"
                select_change_dict['reviewers']  = "NA"
            return select_change_dict
        except (KeyError, TypeError, AttributeError) as e:
            print(f'Error in this URL: {written_url}')
            print(f'KeyError in this dictionary: {full_result}')
            print(f'Error was: {e!r}')
            return {}

    df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(
        lambda urls: [get_change_details(url) for url in urls] if urls else []
    )

    # Only lookups that produced a result are parsed.
    df['selected_gerrit_results'] = df['gerrit_full_results'].progress_apply(
        lambda results: [parse_selected_metadata(item['written_url_in_message'], item['full_result'])
                        for item in results
                        if item.get('full_result') and item.get('written_url_in_message')]
    )

    return df

def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
    """Add a column listing the Gerrit change URLs found in each row's text.

    Args:
        df: DataFrame holding the text to scan; it is modified in place.
        text_column: Name of the column containing the message text.
        new_column: Name of the column that receives the list of URLs.

    Returns:
        The same DataFrame, with ``new_column`` holding one list of matched
        URL strings per row (an empty list when nothing matches).
    """
    gerrit_url_re = re.compile(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?")
    # Cells are cast to str before they are scanned.
    messages = df[text_column].astype(str)
    df[new_column] = messages.apply(gerrit_url_re.findall)
    return df

if __name__ == "__main__":
    # Load the master discussion data, extract the Gerrit change URLs from
    # each comment, then query Gerrit for every URL that was found.
    source_csv = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv"
    discussions = pd.read_csv(source_csv)
    #discussions = discussions.head(300)
    df = query_gerrit_changes(add_gerrit_urls_column(discussions))
    # NOTE(review): the result is not written anywhere while the line below
    # stays commented out.
    #df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080725_gerrit_filled_df.csv", index=False)
    #print(df)
    #query_change_detail("https://gerrit.wikimedia.org/r/85783")