1
0

updating the gerrit data, let's see if it's good data

This commit is contained in:
Matthew Gaughan 2025-08-07 20:03:47 -05:00
parent 542a4f5323
commit e922dae272
2 changed files with 151705 additions and 7 deletions

File diff suppressed because one or more lines are too long

View File

@@ -59,12 +59,18 @@ def query_change_detail(
return [select_change_dict, result] return [select_change_dict, result]
def query_gerrit_changes(df, sleep=20): def query_gerrit_changes(df, sleep=25):
tqdm.pandas() tqdm.pandas()
#get the information from the Gerrit change from the URL that's written in the message #get the information from the Gerrit change from the URL that's written in the message
def get_change_details(written_url): def get_change_details(written_url):
time.sleep(sleep) time.sleep(sleep)
short_change_id = written_url.split("/")[-1] #short_change_id = written_url.split("/")[-1]
match = re.search(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)", written_url)
if match:
short_change_id = match.group(1)
else:
print(f"Error: No change ID found in URL: {written_url}")
return {"written_url_in_message": written_url, "full_result": None}
api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail" api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
username = os.environ.get("GERRIT_USERNAME") username = os.environ.get("GERRIT_USERNAME")
@@ -87,6 +93,10 @@ def query_gerrit_changes(df, sleep=20):
try: try:
select_change_dict = {} select_change_dict = {}
select_change_dict['written_url_in_message'] = written_url select_change_dict['written_url_in_message'] = written_url
if "/#/c/" in written_url:
select_change_dict['change_type'] = "/#/c/"
else:
select_change_dict['change_type'] = "just /r/"
#getting ID #getting ID
select_change_dict['id'] = full_result['change_id'] select_change_dict['id'] = full_result['change_id']
select_change_dict['project'] = full_result['project'] select_change_dict['project'] = full_result['project']
@@ -117,7 +127,7 @@ def query_gerrit_changes(df, sleep=20):
print(f'KeyError in this dictionary: {full_result}') print(f'KeyError in this dictionary: {full_result}')
return {} return {}
df['gerrit_full_results'] = df["gerrit_urls"].progress_apply( df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(
lambda urls: [get_change_details(url) for url in urls] if urls else [] lambda urls: [get_change_details(url) for url in urls] if urls else []
) )
@@ -129,16 +139,16 @@ def query_gerrit_changes(df, sleep=20):
return df return df
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_urls"): def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
pattern = r"(https://gerrit\.wikimedia\.org[^\s]*)" pattern = r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?"
df[new_column] = df[text_column].astype(str).apply(lambda msg: re.findall(pattern, msg)) df[new_column] = df[text_column].astype(str).apply(lambda msg: re.findall(pattern, msg))
return df return df
if __name__ == "__main__": if __name__ == "__main__":
df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv") df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv")
#df = df.head(30) #df = df.head(300)
df = add_gerrit_urls_column(df) df = add_gerrit_urls_column(df)
df = query_gerrit_changes(df) df = query_gerrit_changes(df)
df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False) df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False)
#print(df)
#query_change_tail("https://gerrit.wikimedia.org/r/85783") #query_change_tail("https://gerrit.wikimedia.org/r/85783")