updating the gerrit data, let's see if it's good data
This commit is contained in:
parent
542a4f5323
commit
e922dae272
151688
072525_gerrit_collection/080425_gerrit_filled_df.csv
Normal file
151688
072525_gerrit_collection/080425_gerrit_filled_df.csv
Normal file
File diff suppressed because one or more lines are too long
@ -59,12 +59,18 @@ def query_change_detail(
|
|||||||
return [select_change_dict, result]
|
return [select_change_dict, result]
|
||||||
|
|
||||||
|
|
||||||
def query_gerrit_changes(df, sleep=20):
|
def query_gerrit_changes(df, sleep=25):
|
||||||
tqdm.pandas()
|
tqdm.pandas()
|
||||||
#get the information from the Gerrit change from the URL that's written in the message
|
#get the information from the Gerrit change from the URL that's written in the message
|
||||||
def get_change_details(written_url):
|
def get_change_details(written_url):
|
||||||
time.sleep(sleep)
|
time.sleep(sleep)
|
||||||
short_change_id = written_url.split("/")[-1]
|
#short_change_id = written_url.split("/")[-1]
|
||||||
|
match = re.search(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)", written_url)
|
||||||
|
if match:
|
||||||
|
short_change_id = match.group(1)
|
||||||
|
else:
|
||||||
|
print(f"Error: No change ID found in URL: {written_url}")
|
||||||
|
return {"written_url_in_message": written_url, "full_result": None}
|
||||||
api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
|
api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
|
||||||
|
|
||||||
username = os.environ.get("GERRIT_USERNAME")
|
username = os.environ.get("GERRIT_USERNAME")
|
||||||
@ -87,6 +93,10 @@ def query_gerrit_changes(df, sleep=20):
|
|||||||
try:
|
try:
|
||||||
select_change_dict = {}
|
select_change_dict = {}
|
||||||
select_change_dict['written_url_in_message'] = written_url
|
select_change_dict['written_url_in_message'] = written_url
|
||||||
|
if "/#/c/" in written_url:
|
||||||
|
select_change_dict['change_type'] = "/#/c/"
|
||||||
|
else:
|
||||||
|
select_change_dict['change_type'] = "just /r/"
|
||||||
#getting ID
|
#getting ID
|
||||||
select_change_dict['id'] = full_result['change_id']
|
select_change_dict['id'] = full_result['change_id']
|
||||||
select_change_dict['project'] = full_result['project']
|
select_change_dict['project'] = full_result['project']
|
||||||
@ -117,7 +127,7 @@ def query_gerrit_changes(df, sleep=20):
|
|||||||
print(f'KeyError in this dictionary: {full_result}')
|
print(f'KeyError in this dictionary: {full_result}')
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
df['gerrit_full_results'] = df["gerrit_urls"].progress_apply(
|
df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(
|
||||||
lambda urls: [get_change_details(url) for url in urls] if urls else []
|
lambda urls: [get_change_details(url) for url in urls] if urls else []
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -129,16 +139,16 @@ def query_gerrit_changes(df, sleep=20):
|
|||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_urls"):
|
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
|
||||||
pattern = r"(https://gerrit\.wikimedia\.org[^\s]*)"
|
pattern = r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?"
|
||||||
df[new_column] = df[text_column].astype(str).apply(lambda msg: re.findall(pattern, msg))
|
df[new_column] = df[text_column].astype(str).apply(lambda msg: re.findall(pattern, msg))
|
||||||
return df
|
return df
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv")
|
df = pd.read_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv")
|
||||||
#df = df.head(30)
|
#df = df.head(300)
|
||||||
df = add_gerrit_urls_column(df)
|
df = add_gerrit_urls_column(df)
|
||||||
df = query_gerrit_changes(df)
|
df = query_gerrit_changes(df)
|
||||||
df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False)
|
df.to_csv("/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv", index=False)
|
||||||
|
#print(df)
|
||||||
#query_change_tail("https://gerrit.wikimedia.org/r/85783")
|
#query_change_tail("https://gerrit.wikimedia.org/r/85783")
|
||||||
|
Loading…
Reference in New Issue
Block a user