1
0

updates to gerrit collection, at least intermediate thing for gerrit results

This commit is contained in:
Matthew Gaughan 2025-08-04 16:36:45 -05:00
parent 6a760decfe
commit 542a4f5323
2 changed files with 151779 additions and 2 deletions

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,7 @@ import requests
import re import re
import datetime import datetime
import time import time
from tqdm import tqdm
#from urllib.parse import quote_plus #from urllib.parse import quote_plus
#from requests.auth import HTTPDigestAuth #from requests.auth import HTTPDigestAuth
@ -22,13 +23,19 @@ def query_change_detail(
time.sleep(sleep) time.sleep(sleep)
short_change_id = written_url.split("/")[-1] short_change_id = written_url.split("/")[-1]
username = os.environ.get("GERRIT_USERNAME")
http_password = os.environ.get("GERRIT_HTTP_PASSWORD")
api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail" api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
response = requests.get(api_url, headers={'Content-Type': 'application/json'}) response = requests.get(api_url, auth=(username, http_password), headers={'Content-Type': 'application/json'})
result = json.loads(response.text[5:]) result = json.loads(response.text[5:])
select_change_dict = {} select_change_dict = {}
#making note of what the url was in the message
select_change_dict['written_url_in_message'] = written_url
result['written_url_in_message'] = written_url
#getting ID #getting ID
select_change_dict['id'] = result['change_id'] select_change_dict['id'] = result['change_id']
select_change_dict['project'] = result['project'] select_change_dict['project'] = result['project']
@ -49,7 +56,89 @@ def query_change_detail(
select_change_dict['reviewers'] = result['reviewers']['REVIEWER'] select_change_dict['reviewers'] = result['reviewers']['REVIEWER']
print(result) print(result)
print(select_change_dict) print(select_change_dict)
return [select_change_dict, result]
def query_gerrit_changes(df, sleep=20, timeout=60):
    """Query Gerrit for every URL in ``df["gerrit_urls"]`` and attach the results.

    Two columns are added to ``df`` (the frame is modified in place and also
    returned):

    * ``gerrit_full_results`` -- for each row, a list of dicts of the form
      ``{"written_url_in_message": url, "full_result": parsed_json_or_None}``.
    * ``selected_gerrit_results`` -- for each row, a list of flattened
      metadata dicts, built only from the entries whose lookup succeeded.

    Args:
        df: DataFrame with a ``gerrit_urls`` column; each cell is iterated as
            a collection of URL strings (empty/falsy cells are skipped).
        sleep: Seconds to pause before each HTTP request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the whole run.

    Returns:
        The same DataFrame, with the two columns above added.
    """
    tqdm.pandas()

    # Read the credentials once instead of on every request. If either one is
    # missing, fall back to an anonymous request rather than sending the
    # literal pair (None, None) as basic-auth credentials.
    username = os.environ.get("GERRIT_USERNAME")
    http_password = os.environ.get("GERRIT_HTTP_PASSWORD")
    auth = (username, http_password) if username and http_password else None

    # Gerrit prepends this marker to JSON responses; it has to be removed
    # before the body can be parsed.
    xssi_prefix = ")]}'"

    # Finds the numeric change id in the common URL shapes
    # (/r/12345, /r/#/c/12345/, /r/c/<project>/+/12345/<patchset>).
    # The lookahead keeps names that merely start with digits from matching.
    change_number_pattern = re.compile(r"/(?:\+|c|r)/(\d+)(?![\w-])")

    def extract_change_id(written_url):
        """Return the change identifier to look up for ``written_url``."""
        match = change_number_pattern.search(written_url)
        if match:
            return match.group(1)
        # Fallback: last non-empty path segment, so a trailing "/" does not
        # produce an empty identifier.
        return written_url.rstrip("/").split("/")[-1]

    def get_change_details(written_url):
        """Fetch the change detail for the URL that was written in the message."""
        failed = {"written_url_in_message": written_url, "full_result": None}
        time.sleep(sleep)
        short_change_id = extract_change_id(written_url)
        api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
        try:
            response = requests.get(
                api_url,
                auth=auth,
                headers={'Content-Type': 'application/json'},
                timeout=timeout,
            )
        except requests.RequestException as e:
            # One network failure must not abort the rest of the crawl.
            print("Request failed:", written_url, e)
            return failed

        if response.status_code != 200:
            print("Bad response:", response.status_code, response.text)
            return failed

        body = response.text
        if body.startswith(xssi_prefix):
            body = body[len(xssi_prefix):]
        try:
            result = json.loads(body)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print("JSON decode error:", e)
            print("Text was:", repr(response.text))
            return failed
        return {"written_url_in_message": written_url, "full_result": result}

    def parse_selected_metadata(written_url, full_result):
        """Flatten the fields of interest; return ``{}`` if any is missing."""
        try:
            select_change_dict = {
                'written_url_in_message': written_url,
                # change identity
                'id': full_result['change_id'],
                'project': full_result['project'],
                'description': full_result['subject'],
                # owner record and email
                'owner_dict': full_result['owner'],
                'owner_email': full_result['owner']['email'],
                # current revision count
                'revision_count': full_result['current_revision_number'],
                # size of the change
                'code_insertions': full_result['insertions'],
                'code_deletions': full_result['deletions'],
                'status': full_result['status'],
            }
            reviewers = full_result.get('reviewers', {}).get('REVIEWER', [])
            if reviewers:
                select_change_dict['reviewer_count'] = len(reviewers)
                select_change_dict['reviewers'] = reviewers
            else:
                select_change_dict['reviewer_count'] = "NA"
                select_change_dict['reviewers'] = "NA"
            return select_change_dict
        except (KeyError, TypeError, AttributeError) as e:
            print(f'Error in this URL: {written_url}')
            print(f'KeyError in this dictionary: {full_result}')
            # Report the actual failure so the missing field can be identified.
            print(f'Error was: {e!r}')
            return {}

    df['gerrit_full_results'] = df["gerrit_urls"].progress_apply(
        lambda urls: [get_change_details(url) for url in urls] if urls else []
    )
    # Only entries with both a URL and a successfully parsed result are
    # flattened; failed lookups stay visible in 'gerrit_full_results'.
    df['selected_gerrit_results'] = df['gerrit_full_results'].progress_apply(
        lambda results: [
            parse_selected_metadata(item['written_url_in_message'], item['full_result'])
            for item in results
            if item.get('full_result') and item.get('written_url_in_message')
        ]
    )
    return df
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_urls"):
    """Extract Wikimedia Gerrit URLs from a text column into a new list column.

    Every value in ``df[text_column]`` is converted to ``str`` and scanned for
    substrings starting with ``https://gerrit.wikimedia.org`` and running up
    to the next whitespace. Trailing sentence punctuation and closing
    brackets are stripped from each match, so a URL written as
    ``(https://gerrit.wikimedia.org/r/85783).`` is stored without the ``).``.

    Args:
        df: DataFrame holding the text; it is modified in place.
        text_column: Name of the column to scan.
        new_column: Name of the column that receives the list of URLs found
            in each row (an empty list when there are none).

    Returns:
        The same DataFrame, with ``new_column`` added.
    """
    url_pattern = re.compile(r"https://gerrit\.wikimedia\.org\S*")
    # Characters that commonly follow a URL in prose but are not part of it.
    trailing_punctuation = ".,;:!?)]}>'\""

    def find_urls(msg):
        return [url.rstrip(trailing_punctuation) for url in url_pattern.findall(msg)]

    df[new_column] = df[text_column].astype(str).apply(find_urls)
    return df
if __name__ == "__main__":
    # Script entry point: read the discussion CSV, extract the Gerrit URLs
    # from each comment, query Gerrit for every URL, and write the enriched
    # frame back out as CSV.
    df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv")
    #df = df.head(30)
    df = add_gerrit_urls_column(df)
    df = query_gerrit_changes(df)
    df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False)
    #query_change_tail("https://gerrit.wikimedia.org/r/85783")