Updating the Gerrit data; let's see if it's good data.
This commit is contained in:
parent
542a4f5323
commit
e922dae272
151688
072525_gerrit_collection/080425_gerrit_filled_df.csv
Normal file
151688
072525_gerrit_collection/080425_gerrit_filled_df.csv
Normal file
File diff suppressed because one or more lines are too long
@ -59,12 +59,18 @@ def query_change_detail(
|
||||
return [select_change_dict, result]
|
||||
|
||||
|
||||
def query_gerrit_changes(df, sleep=20):
|
||||
def query_gerrit_changes(df, sleep=25):
|
||||
tqdm.pandas()
|
||||
#get the information from the Gerrit change from the URL that's written in the message
|
||||
def get_change_details(written_url):
|
||||
time.sleep(sleep)
|
||||
short_change_id = written_url.split("/")[-1]
|
||||
#short_change_id = written_url.split("/")[-1]
|
||||
match = re.search(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?([0-9]+)(?:/|$)", written_url)
|
||||
if match:
|
||||
short_change_id = match.group(1)
|
||||
else:
|
||||
print(f"Error: No change ID found in URL: {written_url}")
|
||||
return {"written_url_in_message": written_url, "full_result": None}
|
||||
api_url = f"https://gerrit.wikimedia.org/r/changes/{short_change_id}/detail"
|
||||
|
||||
username = os.environ.get("GERRIT_USERNAME")
|
||||
@ -87,6 +93,10 @@ def query_gerrit_changes(df, sleep=20):
|
||||
try:
|
||||
select_change_dict = {}
|
||||
select_change_dict['written_url_in_message'] = written_url
|
||||
if "/#/c/" in written_url:
|
||||
select_change_dict['change_type'] = "/#/c/"
|
||||
else:
|
||||
select_change_dict['change_type'] = "just /r/"
|
||||
#getting ID
|
||||
select_change_dict['id'] = full_result['change_id']
|
||||
select_change_dict['project'] = full_result['project']
|
||||
@ -117,7 +127,7 @@ def query_gerrit_changes(df, sleep=20):
|
||||
print(f'KeyError in this dictionary: {full_result}')
|
||||
return {}
|
||||
|
||||
df['gerrit_full_results'] = df["gerrit_urls"].progress_apply(
|
||||
df['gerrit_full_results'] = df["gerrit_change_urls"].progress_apply(
|
||||
lambda urls: [get_change_details(url) for url in urls] if urls else []
|
||||
)
|
||||
|
||||
@ -129,16 +139,16 @@ def query_gerrit_changes(df, sleep=20):
|
||||
|
||||
return df
|
||||
|
||||
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_urls"):
|
||||
pattern = r"(https://gerrit\.wikimedia\.org[^\s]*)"
|
||||
def add_gerrit_urls_column(df, text_column="comment_text", new_column="gerrit_change_urls"):
    """Collect the Gerrit change URLs mentioned in each row's text.

    Every value in ``df[text_column]`` is cast to ``str`` and scanned for
    URLs shaped like ``https://gerrit.wikimedia.org/r/<digits>`` or
    ``https://gerrit.wikimedia.org/r/#/c/<digits>``, each optionally followed
    by ``/`` and any run of non-whitespace characters.

    Args:
        df: DataFrame holding the text to scan; it is modified in place.
        text_column: Name of the column containing the message text.
        new_column: Name of the column that receives the list of matched
            URLs for each row (an empty list when nothing matches).

    Returns:
        The same ``df`` object, with ``new_column`` added or overwritten.
    """
    change_url_re = re.compile(r"https://gerrit\.wikimedia\.org/r/(?:#/c/)?[0-9]+(?:/[^\s]*)?")
    # Cast first so non-string cells are scanned as their string form
    # instead of raising inside the regex call.
    messages = df[text_column].astype(str)
    df[new_column] = messages.apply(change_url_re.findall)
    return df
|
||||
|
||||
if __name__ == "__main__":
    # Pipeline: load the discussion data, extract the Gerrit change URLs from
    # each comment, query Gerrit for every URL, then write the result to CSV.
    source_csv = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/071425_master_discussion_data.csv"
    output_csv = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/072525_gerrit_collection/080425_gerrit_filled_df.csv"

    df = pd.read_csv(source_csv)
    # For a quick trial run, truncate the frame here first, e.g.:
    #   df = df.head(300)
    df = add_gerrit_urls_column(df)
    df = query_gerrit_changes(df)
    df.to_csv(output_csv, index=False)

    # Debugging aids kept from earlier iterations:
    #   print(df)
    #   query_change_tail("https://gerrit.wikimedia.org/r/85783")
|
||||
|
Loading…
Reference in New Issue
Block a user