updating with OLMO-generated classifications
This commit is contained in:
parent
9e4c05e347
commit
3e21ac1bb7
266806
p2/quest/072525_biberplus_labels.csv
Normal file
266806
p2/quest/072525_biberplus_labels.csv
Normal file
File diff suppressed because one or more lines are too long
266806
p2/quest/072525_olmo_messages_categorized.csv
Normal file
266806
p2/quest/072525_olmo_messages_categorized.csv
Normal file
File diff suppressed because one or more lines are too long
36
p2/quest/cleaned_biberplus-categorization.log
Normal file
36
p2/quest/cleaned_biberplus-categorization.log
Normal file
@ -0,0 +1,36 @@
|
||||
starting the job at: Fri Jul 25 20:48:01 CDT 2025
|
||||
setting up the environment
|
||||
running the biberplus labeling script
|
||||
26024
|
||||
26024
|
||||
id ... http_flag
|
||||
0 56791 ... NaN
|
||||
1 269631 ... NaN
|
||||
2 269628 ... NaN
|
||||
3 269622 ... NaN
|
||||
4 56737 ... NaN
|
||||
... ... ... ...
|
||||
26019 403186 ... True
|
||||
26020 78646 ... True
|
||||
26021 429163 ... True
|
||||
26022 429137 ... True
|
||||
26023 418783 ... True
|
||||
|
||||
[26024 rows x 22 columns]
|
||||
id ... cleaned_messages
|
||||
0 56791 ... pawn character editing\n\nseen on master branc...
|
||||
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
|
||||
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
|
||||
3 269622 ... Change 86685 had a related patch set uploaded ...
|
||||
4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th...
|
||||
... ... ... ...
|
||||
26019 403186 ... Could you attach a screenshot please? Drag & d...
|
||||
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
|
||||
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
|
||||
26022 429137 ... SCREEN_NAME: Please answer.
|
||||
26023 418783 ... I cannot replicate this. What's the name of th...
|
||||
|
||||
[26024 rows x 122 columns]
|
||||
biberplus labeling pau
|
||||
job finished, cleaning up
|
||||
job pau at: Fri Jul 25 20:55:26 CDT 2025
|
@ -1,88 +1,8 @@
|
||||
setting up the environment by loading in conda environment at Fri Jul 25 15:31:16 CDT 2025
|
||||
running the bertopic job at Fri Jul 25 15:31:16 CDT 2025
|
||||
setting up the environment by loading in conda environment at Fri Jul 25 21:20:22 CDT 2025
|
||||
running the bertopic job at Fri Jul 25 21:20:23 CDT 2025
|
||||
cuda
|
||||
NVIDIA A100-SXM4-80GB
|
||||
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=8a2376e5-9aa1-4450-7c89-6e41bdbc6af6, L2_cache_size=40MB)
|
||||
Loading checkpoint shards: 0%| | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards: 8%|▊ | 1/12 [00:00<00:04, 2.71it/s]
Loading checkpoint shards: 17%|█▋ | 2/12 [00:00<00:05, 1.98it/s]
Loading checkpoint shards: 25%|██▌ | 3/12 [00:01<00:05, 1.75it/s]
Loading checkpoint shards: 33%|███▎ | 4/12 [00:02<00:04, 1.71it/s]
Loading checkpoint shards: 42%|████▏ | 5/12 [00:02<00:04, 1.73it/s]
Loading checkpoint shards: 50%|█████ | 6/12 [00:03<00:03, 1.72it/s]
Loading checkpoint shards: 58%|█████▊ | 7/12 [00:03<00:02, 1.72it/s]
Loading checkpoint shards: 67%|██████▋ | 8/12 [00:04<00:02, 1.67it/s]
Loading checkpoint shards: 75%|███████▌ | 9/12 [00:05<00:01, 1.72it/s]
Loading checkpoint shards: 83%|████████▎ | 10/12 [00:05<00:01, 1.66it/s]
Loading checkpoint shards: 92%|█████████▏| 11/12 [00:06<00:00, 1.78it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00, 1.90it/s]
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION DISCUSSION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION DISCUSSION
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION DISCUSSION
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION USAGE
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION USAGE
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION USAGE
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
SOLUTION DISCUSSION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
INVESTIGATION AND EXPLORATION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
BUG REPRODUCTION
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
ACTION ON ISSUE
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
this is the response:::: ----------------------------
|
||||
TASK PROGRESS
|
||||
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=6e26de77-c067-13c4-e9e0-8200eb5a348f, L2_cache_size=40MB)
|
||||
Loading checkpoint shards: 0%| | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards: 8%|▊ | 1/12 [00:00<00:03, 2.82it/s]
Loading checkpoint shards: 17%|█▋ | 2/12 [00:00<00:04, 2.13it/s]
Loading checkpoint shards: 25%|██▌ | 3/12 [00:01<00:04, 1.96it/s]
Loading checkpoint shards: 33%|███▎ | 4/12 [00:02<00:04, 1.86it/s]
Loading checkpoint shards: 42%|████▏ | 5/12 [00:02<00:03, 1.86it/s]
Loading checkpoint shards: 50%|█████ | 6/12 [00:03<00:03, 1.76it/s]
Loading checkpoint shards: 58%|█████▊ | 7/12 [00:03<00:02, 1.74it/s]
Loading checkpoint shards: 67%|██████▋ | 8/12 [00:04<00:02, 1.68it/s]
Loading checkpoint shards: 75%|███████▌ | 9/12 [00:04<00:01, 1.71it/s]
Loading checkpoint shards: 83%|████████▎ | 10/12 [00:05<00:01, 1.73it/s]
Loading checkpoint shards: 92%|█████████▏| 11/12 [00:06<00:00, 1.83it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00, 1.98it/s]
|
||||
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
|
||||
unsupervised olmo categorization pau at Sat Jul 26 12:23:56 CDT 2025
|
||||
|
@ -33,6 +33,7 @@ import numpy as np
|
||||
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
|
||||
import cupy
|
||||
import random
|
||||
import re
|
||||
|
||||
def biberplus_labeler(text):
|
||||
print(len(text))
|
||||
@ -40,14 +41,35 @@ def biberplus_labeler(text):
|
||||
config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
|
||||
pipeline = load_pipeline(config)
|
||||
features_list = []
|
||||
cleaned_messages = []
|
||||
for message in text:
|
||||
message_label = calculate_tag_frequencies(message, pipeline, config)
|
||||
|
||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||
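# 1. replace code with CODE (NOTE: the inline pattern runs first; on an ordinary ``` fence it consumes the innermost backticks and leaves ``CODE``, so the block pattern below no longer matches -- consider running the block pattern first)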
|
||||
comment_text = re.sub(r'`[^`]+`', 'CODE', message) # Inline code
|
||||
comment_text = re.sub(r'```[\s\S]+?```', 'CODE', comment_text) # Block code
|
||||
# 2. replace quotes with QUOTE
|
||||
lines = comment_text.split('\n')
|
||||
lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines]
|
||||
comment_text = '\n'.join(lines)
|
||||
# 3. replace Gerrit URLs with GERRIT_URL
|
||||
gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+'
|
||||
comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text)
|
||||
# replace any remaining http(s) URLs with URL
|
||||
url_pattern = r'https?://[^\s]+'
|
||||
comment_text = re.sub(url_pattern, 'URL', comment_text)
|
||||
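# 4. where possible, replace @-mentions with SCREEN_NAME (NOTE: the pattern also matches the whitespace before the @, so that whitespace is dropped from the output)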
|
||||
cleaned_message = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
||||
cleaned_messages.append(cleaned_message)
|
||||
|
||||
message_label = calculate_tag_frequencies(cleaned_message, pipeline, config)
|
||||
mean_row = message_label.set_index('tag')['mean']
|
||||
mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
|
||||
features_list.append(mean_row)
|
||||
print(len(features_list))
|
||||
frequencies_df = pd.DataFrame(features_list)
|
||||
frequencies_df['message'] = text
|
||||
frequencies_df['cleaned_messages'] = cleaned_messages
|
||||
frequencies_df = frequencies_df.reset_index(drop=True)
|
||||
return frequencies_df
|
||||
|
||||
@ -85,6 +107,6 @@ if __name__ == "__main__":
|
||||
assert len(first_discussion_df) == len(final_discussion_df)
|
||||
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||
# if passing the prior asserts, let's write to a csv
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False)
|
||||
print('biberplus labeling pau')
|
||||
|
||||
|
@ -74,15 +74,30 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
||||
text_dict['task_title'] = row[1]
|
||||
text_dict['comment_text'] = row[2]
|
||||
text_dict['comment_type'] = row[12]
|
||||
raw_text = text_dict['comment_text']
|
||||
|
||||
#TODO: comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||
#print(raw_text)
|
||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||
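# 1. replace code with CODE (NOTE: the inline pattern runs first; on an ordinary ``` fence it consumes the innermost backticks and leaves ``CODE``, so the block pattern below no longer matches -- consider running the block pattern first)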
|
||||
comment_text = re.sub(r'`[^`]+`', 'CODE', raw_text) # Inline code
|
||||
comment_text = re.sub(r'```[\s\S]+?```', 'CODE', comment_text) # Block code
|
||||
# 2. replace quotes with QUOTE
|
||||
# 3. replace URLs with URL
|
||||
lines = comment_text.split('\n')
|
||||
lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines]
|
||||
comment_text = '\n'.join(lines)
|
||||
# 3. replace Gerrit URLs with GERRIT_URL
|
||||
gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+'
|
||||
comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text)
|
||||
# replace any remaining http(s) URLs with URL
|
||||
url_pattern = r'https?://[^\s]+'
|
||||
comment_text = re.sub(url_pattern, 'URL', comment_text)
|
||||
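# 4. where possible, replace @-mentions with SCREEN_NAME (NOTE: the pattern also matches the whitespace before the @, so that whitespace is dropped from the output)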
|
||||
comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
||||
#print(comment_text)
|
||||
|
||||
text_dict['cleaned_comment_text'] = comment_text
|
||||
#build out prompt construction; more specificity in data provided
|
||||
given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['comment_text']}**'\n"
|
||||
given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['cleaned_comment_text']}**'\n"
|
||||
prompt_question="What do you think about this message? What are they saying?"
|
||||
#prompt = f"{prompt_1}\n\n{example_1}\n\n{example_2}\n\n{example_3}\n\n{example_4}\n\n{given_data}\n"
|
||||
prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
|
||||
@ -91,7 +106,8 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
||||
#deterministic sampling and getting the response back
|
||||
response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
|
||||
response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
|
||||
print("this is the response:::: ----------------------------")
|
||||
|
||||
#print("this is the response:::: ----------------------------")
|
||||
#print(response_txt)
|
||||
#getting the resulting codes
|
||||
#codes_id = response_txt.rfind("CATEGORIES:")
|
||||
@ -101,7 +117,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
||||
following_text = match.group(1).strip("[]*")
|
||||
else:
|
||||
following_text = "NO CATEGORY"
|
||||
print(following_text)
|
||||
#print(following_text)
|
||||
text_dict['olmo_category'] = following_text
|
||||
'''
|
||||
for item in result.strip(";").split(";"):
|
||||
@ -112,10 +128,8 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
||||
cite_dict[key] = value
|
||||
'''
|
||||
array_of_categorizations.append(text_dict)
|
||||
if index > 40:
|
||||
break
|
||||
#CSV everything
|
||||
df = pd.DataFrame(array_of_categorizations)
|
||||
#df.to_csv('072525_olmo_messages_categorized.csv', index=False)
|
||||
df.to_csv('072525_olmo_messages_categorized.csv', index=False)
|
||||
|
||||
|
||||
|
@ -8,7 +8,7 @@
|
||||
#SBATCH --mem=64G
|
||||
#SBATCH --cpus-per-task=4
|
||||
#SBATCH --job-name=biberplus-categorization
|
||||
#SBATCH --output=biberplus-categorization.log
|
||||
#SBATCH --output=cleaned_biberplus-categorization.log
|
||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||
|
||||
|
@ -24,3 +24,5 @@ conda activate olmo
|
||||
echo "running the bertopic job at $(date)"
|
||||
|
||||
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/info_labeling.py
|
||||
|
||||
echo "unsupervised olmo categorization pau at $(date)"
|
||||
|
Loading…
Reference in New Issue
Block a user