updating the gerrit data, let's see if it's good data

2025-08-07 20:03:47 -05:00 · 2025-08-07 20:03:47 -05:00 · e922dae272
commit e922dae272
parent 542a4f5323
2 changed files with 151705 additions and 7 deletions
--- a/072525_gerrit_collection/080425_gerrit_filled_df.csv
+++ b/072525_gerrit_collection/080425_gerrit_filled_df.csv
--- a/072525_gerrit_collection/gerrit_patch_collection.py
+++ b/072525_gerrit_collection/gerrit_patch_collection.py
@@ -59,12 +59,18 @@ def query_change_detail(
    return [select_change_dict, result]
-def query_gerrit_changes(df, sleep=20):
+def query_gerrit_changes(df, sleep=25):
    tqdm.pandas()
    #get the information from the Gerrit change from the URL that's written in the message
    def get_change_details(written_url):
        time.sleep(sleep)
-        short_change_id = written_url.split("/")[-1]
+        #short_change_id = written_url.split("/")[-1]
        match = re.search(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)", written_url)
        if match:
            short_change_id = match.group(1)
        else:
            print(f"Error: No change ID found in URL: {written_url}")
            return {"written_url_in_message": written_url, "full_result": None}
        api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
        username = os.environ.get("GERRIT_USERNAME")
@@ -87,6 +93,10 @@ def query_gerrit_changes(df, sleep=20):
        try:
            select_change_dict = {}
            select_change_dict['written_url_in_message'] = written_url
            if "/#/c/" in written_url:
                select_change_dict['change_type'] = "/#/c/"
            else:
                select_change_dict['change_type'] = "just /r/"
            #getting ID
            select_change_dict['id'] = full_result['change_id']
            select_change_dict['project'] = full_result['project']
@@ -117,7 +127,7 @@ def query_gerrit_changes(df, sleep=20):
            print(f'KeyError in this dictionary: {full_result}')
            return {}
-    df['gerrit_full_results'] = df["gerrit_urls"].progress_apply(
+    df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(
        lambda urls: [get_change_details(url) for url in urls] if urls else []
    )
@@ -129,16 +139,16 @@ def query_gerrit_changes(df, sleep=20):
    return df 
-def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_urls"):
+def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
-    pattern = r"(https://gerrit\.wikimedia\.org[^\s]*)"
+    pattern = r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?"
    df[new_column] = df[text_column].astype(str).apply(lambda msg: re.findall(pattern, msg))
    return df
 if __name__ == "__main__":
    df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv") 
-    #df = df.head(30)
+    #df = df.head(300)
    df = add_gerrit_urls_column(df)
    df = query_gerrit_changes(df)
    df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False)
-
+    #print(df)
    #query_change_tail("https://gerrit.wikimedia.org/r/85783")