updating with OLMO-generated classifications
This commit is contained in:
		
							parent
							
								
									9e4c05e347
								
							
						
					
					
						commit
						3e21ac1bb7
					
				
							
								
								
									
										266806
									
								
								p2/quest/072525_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										266806
									
								
								p2/quest/072525_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										266806
									
								
								p2/quest/072525_olmo_messages_categorized.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										266806
									
								
								p2/quest/072525_olmo_messages_categorized.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										36
									
								
								p2/quest/cleaned_biberplus-categorization.log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								p2/quest/cleaned_biberplus-categorization.log
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,36 @@ | |||||||
|  | starting the job at: Fri Jul 25 20:48:01 CDT 2025 | ||||||
|  | setting up the environment | ||||||
|  | running the biberplus labeling script | ||||||
|  | 26024 | ||||||
|  | 26024 | ||||||
|  |            id  ... http_flag | ||||||
|  | 0       56791  ...       NaN | ||||||
|  | 1      269631  ...       NaN | ||||||
|  | 2      269628  ...       NaN | ||||||
|  | 3      269622  ...       NaN | ||||||
|  | 4       56737  ...       NaN | ||||||
|  | ...       ...  ...       ... | ||||||
|  | 26019  403186  ...      True | ||||||
|  | 26020   78646  ...      True | ||||||
|  | 26021  429163  ...      True | ||||||
|  | 26022  429137  ...      True | ||||||
|  | 26023  418783  ...      True | ||||||
|  | 
 | ||||||
|  | [26024 rows x 22 columns] | ||||||
|  |            id  ...                                   cleaned_messages | ||||||
|  | 0       56791  ...  pawn character editing\n\nseen on master branc... | ||||||
|  | 1      269631  ...  Change 86685 merged by jenkins-bot:\nFollow-up... | ||||||
|  | 2      269628  ...  *** Bug 54785 has been marked as a duplicate o... | ||||||
|  | 3      269622  ...  Change 86685 had a related patch set uploaded ... | ||||||
|  | 4       56737  ...  **Author:** CODE\n\n**Description:**\nAfter th... | ||||||
|  | ...       ...  ...                                                ... | ||||||
|  | 26019  403186  ...  Could you attach a screenshot please? Drag & d... | ||||||
|  | 26020   78646  ...  Hi,\n\nWe have a wiki which has a part which c... | ||||||
|  | 26021  429163  ...  Sorry for not reply-ing. I did a test and coul... | ||||||
|  | 26022  429137  ...                        SCREEN_NAME: Please answer. | ||||||
|  | 26023  418783  ...  I cannot replicate this. What's the name of th... | ||||||
|  | 
 | ||||||
|  | [26024 rows x 122 columns] | ||||||
|  | biberplus labeling pau | ||||||
|  | job finished, cleaning up | ||||||
|  | job pau at: Fri Jul 25 20:55:26 CDT 2025 | ||||||
| @ -1,88 +1,8 @@ | |||||||
| setting up the environment by loading in conda environment at Fri Jul 25 15:31:16 CDT 2025 | setting up the environment by loading in conda environment at Fri Jul 25 21:20:22 CDT 2025 | ||||||
| running the bertopic job at Fri Jul 25 15:31:16 CDT 2025 | running the bertopic job at Fri Jul 25 21:20:23 CDT 2025 | ||||||
| cuda | cuda | ||||||
| NVIDIA A100-SXM4-80GB | NVIDIA A100-SXM4-80GB | ||||||
| _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=8a2376e5-9aa1-4450-7c89-6e41bdbc6af6, L2_cache_size=40MB) | _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=6e26de77-c067-13c4-e9e0-8200eb5a348f, L2_cache_size=40MB) | ||||||
| 
Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards:   8%|▊         | 1/12 [00:00<00:04,  2.71it/s]
Loading checkpoint shards:  17%|█▋        | 2/12 [00:00<00:05,  1.98it/s]
Loading checkpoint shards:  25%|██▌       | 3/12 [00:01<00:05,  1.75it/s]
Loading checkpoint shards:  33%|███▎      | 4/12 [00:02<00:04,  1.71it/s]
Loading checkpoint shards:  42%|████▏     | 5/12 [00:02<00:04,  1.73it/s]
Loading checkpoint shards:  50%|█████     | 6/12 [00:03<00:03,  1.72it/s]
Loading checkpoint shards:  58%|█████▊    | 7/12 [00:03<00:02,  1.72it/s]
Loading checkpoint shards:  67%|██████▋   | 8/12 [00:04<00:02,  1.67it/s]
Loading checkpoint shards:  75%|███████▌  | 9/12 [00:05<00:01,  1.72it/s]
Loading checkpoint shards:  83%|████████▎ | 10/12 [00:05<00:01,  1.66it/s]
Loading checkpoint shards:  92%|█████████▏| 11/12 [00:06<00:00,  1.78it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00,  1.90it/s] | 
Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards:   8%|▊         | 1/12 [00:00<00:03,  2.82it/s]
Loading checkpoint shards:  17%|█▋        | 2/12 [00:00<00:04,  2.13it/s]
Loading checkpoint shards:  25%|██▌       | 3/12 [00:01<00:04,  1.96it/s]
Loading checkpoint shards:  33%|███▎      | 4/12 [00:02<00:04,  1.86it/s]
Loading checkpoint shards:  42%|████▏     | 5/12 [00:02<00:03,  1.86it/s]
Loading checkpoint shards:  50%|█████     | 6/12 [00:03<00:03,  1.76it/s]
Loading checkpoint shards:  58%|█████▊    | 7/12 [00:03<00:02,  1.74it/s]
Loading checkpoint shards:  67%|██████▋   | 8/12 [00:04<00:02,  1.68it/s]
Loading checkpoint shards:  75%|███████▌  | 9/12 [00:04<00:01,  1.71it/s]
Loading checkpoint shards:  83%|████████▎ | 10/12 [00:05<00:01,  1.73it/s]
Loading checkpoint shards:  92%|█████████▏| 11/12 [00:06<00:00,  1.83it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00,  1.98it/s] | ||||||
| this is the response:::: ---------------------------- | This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all. | ||||||
| BUG REPRODUCTION | unsupervised olmo categorization pau at Sat Jul 26 12:23:56 CDT 2025 | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION DISCUSSION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION DISCUSSION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION DISCUSSION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION USAGE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION USAGE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION USAGE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| SOLUTION DISCUSSION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| INVESTIGATION AND EXPLORATION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| BUG REPRODUCTION |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| ACTION ON ISSUE |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
| this is the response:::: ---------------------------- |  | ||||||
| TASK PROGRESS |  | ||||||
|  | |||||||
| @ -33,6 +33,7 @@ import numpy as np | |||||||
| from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies  | from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies  | ||||||
| import cupy | import cupy | ||||||
| import random  | import random  | ||||||
|  | import re  | ||||||
| 
 | 
 | ||||||
| def biberplus_labeler(text): | def biberplus_labeler(text): | ||||||
|     print(len(text)) |     print(len(text)) | ||||||
| @ -40,14 +41,35 @@ def biberplus_labeler(text): | |||||||
|     config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100}) |     config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100}) | ||||||
|     pipeline = load_pipeline(config) |     pipeline = load_pipeline(config) | ||||||
|     features_list = [] |     features_list = [] | ||||||
|  |     cleaned_messages = [] | ||||||
|     for message in text: |     for message in text: | ||||||
|         message_label = calculate_tag_frequencies(message, pipeline, config) |              | ||||||
|  |         # comment_text preprocessing per https://arxiv.org/pdf/1902.07093 | ||||||
|  |         # 1. replace code with CODE | ||||||
|  |         comment_text = re.sub(r'`[^`]+`', 'CODE', message)      # Inline code | ||||||
|  |         comment_text = re.sub(r'```[\s\S]+?```', 'CODE', comment_text)  # Block code | ||||||
|  |         # 2. replace quotes with QUOTE | ||||||
|  |         lines = comment_text.split('\n') | ||||||
|  |         lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines] | ||||||
|  |         comment_text = '\n'.join(lines) | ||||||
|  |         # 3. replace Gerrit URLs with GERRIT URL | ||||||
|  |         gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+' | ||||||
|  |         comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text) | ||||||
|  |         # replace URL with URL | ||||||
|  |         url_pattern = r'https?://[^\s]+' | ||||||
|  |         comment_text = re.sub(url_pattern, 'URL', comment_text)  | ||||||
|  |         # 4. if possible, replace @ with SCREEN_NAME | ||||||
|  |         cleaned_message = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text) | ||||||
|  |         cleaned_messages.append(cleaned_message) | ||||||
|  | 
 | ||||||
|  |         message_label = calculate_tag_frequencies(cleaned_message, pipeline, config) | ||||||
|         mean_row = message_label.set_index('tag')['mean'] |         mean_row = message_label.set_index('tag')['mean'] | ||||||
|         mean_row = mean_row.rename(lambda tag: f"normalized_{tag}") |         mean_row = mean_row.rename(lambda tag: f"normalized_{tag}") | ||||||
|         features_list.append(mean_row) |         features_list.append(mean_row) | ||||||
|     print(len(features_list)) |     print(len(features_list)) | ||||||
|     frequencies_df = pd.DataFrame(features_list) |     frequencies_df = pd.DataFrame(features_list) | ||||||
|     frequencies_df['message'] = text |     frequencies_df['message'] = text | ||||||
|  |     frequencies_df['cleaned_messages'] = cleaned_messages | ||||||
|     frequencies_df = frequencies_df.reset_index(drop=True) |     frequencies_df = frequencies_df.reset_index(drop=True) | ||||||
|     return frequencies_df |     return frequencies_df | ||||||
|   |   | ||||||
| @ -85,6 +107,6 @@ if __name__ == "__main__": | |||||||
|     assert len(first_discussion_df) == len(final_discussion_df) |     assert len(first_discussion_df) == len(final_discussion_df) | ||||||
|     final_discussion_df = final_discussion_df.drop(columns=["message"]) |     final_discussion_df = final_discussion_df.drop(columns=["message"]) | ||||||
|     # if passing the prior asserts, let's write to a csv |     # if passing the prior asserts, let's write to a csv | ||||||
|     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False) |     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False) | ||||||
|     print('biberplus labeling pau') |     print('biberplus labeling pau') | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -74,15 +74,30 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab | |||||||
|         text_dict['task_title'] = row[1] |         text_dict['task_title'] = row[1] | ||||||
|         text_dict['comment_text'] = row[2] |         text_dict['comment_text'] = row[2] | ||||||
|         text_dict['comment_type'] = row[12] |         text_dict['comment_type'] = row[12] | ||||||
|  |         raw_text = text_dict['comment_text'] | ||||||
|          |          | ||||||
|         #TODO: comment_text preprocessing per https://arxiv.org/pdf/1902.07093 |         #print(raw_text) | ||||||
|  |         # comment_text preprocessing per https://arxiv.org/pdf/1902.07093 | ||||||
|         # 1. replace code with CODE |         # 1. replace code with CODE | ||||||
|  |         comment_text = re.sub(r'`[^`]+`', 'CODE', raw_text)      # Inline code | ||||||
|  |         comment_text = re.sub(r'```[\s\S]+?```', 'CODE', comment_text)  # Block code | ||||||
|         # 2. replace quotes with QUOTE |         # 2. replace quotes with QUOTE | ||||||
|         # 3. replace URLs with URL  |         lines = comment_text.split('\n') | ||||||
|  |         lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines] | ||||||
|  |         comment_text = '\n'.join(lines) | ||||||
|  |         # 3. replace Gerrit URLs with GERRIT URL | ||||||
|  |         gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+' | ||||||
|  |         comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text) | ||||||
|  |         # replace URL with URL | ||||||
|  |         url_pattern = r'https?://[^\s]+' | ||||||
|  |         comment_text = re.sub(url_pattern, 'URL', comment_text)  | ||||||
|         # 4. if possible, replace @ with SCREEN_NAME |         # 4. if possible, replace @ with SCREEN_NAME | ||||||
|  |         comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text) | ||||||
|  |         #print(comment_text) | ||||||
|          |          | ||||||
|  |         text_dict['cleaned_comment_text'] = comment_text | ||||||
|         #build out prompt construction; more specificity in data provided |         #build out prompt construction; more specificity in data provided | ||||||
|         given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['comment_text']}**'\n" |         given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['cleaned_comment_text']}**'\n" | ||||||
|         prompt_question="What do you think about this message? What are they saying?" |         prompt_question="What do you think about this message? What are they saying?" | ||||||
|         #prompt = f"{prompt_1}\n\n{example_1}\n\n{example_2}\n\n{example_3}\n\n{example_4}\n\n{given_data}\n" |         #prompt = f"{prompt_1}\n\n{example_1}\n\n{example_2}\n\n{example_3}\n\n{example_4}\n\n{given_data}\n" | ||||||
|         prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}" |         prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}" | ||||||
| @ -91,7 +106,8 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab | |||||||
|         #deterministic sampling and getting the response back  |         #deterministic sampling and getting the response back  | ||||||
|         response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False) |         response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False) | ||||||
|         response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0] |         response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0] | ||||||
|         print("this is the response:::: ----------------------------") |          | ||||||
|  |         #print("this is the response:::: ----------------------------") | ||||||
|         #print(response_txt) |         #print(response_txt) | ||||||
|         #getting the resulting codes  |         #getting the resulting codes  | ||||||
|         #codes_id = response_txt.rfind("CATEGORIES:") |         #codes_id = response_txt.rfind("CATEGORIES:") | ||||||
| @ -101,7 +117,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab | |||||||
|             following_text = match.group(1).strip("[]*") |             following_text = match.group(1).strip("[]*") | ||||||
|         else:  |         else:  | ||||||
|             following_text = "NO CATEGORY" |             following_text = "NO CATEGORY" | ||||||
|         print(following_text) |         #print(following_text) | ||||||
|         text_dict['olmo_category'] = following_text |         text_dict['olmo_category'] = following_text | ||||||
|         ''' |         ''' | ||||||
|         for item in result.strip(";").split(";"): |         for item in result.strip(";").split(";"): | ||||||
| @ -112,10 +128,8 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab | |||||||
|             cite_dict[key] = value |             cite_dict[key] = value | ||||||
|         ''' |         ''' | ||||||
|         array_of_categorizations.append(text_dict) |         array_of_categorizations.append(text_dict) | ||||||
|         if index > 40: |  | ||||||
|             break |  | ||||||
|     #CSV everything |     #CSV everything | ||||||
|     df = pd.DataFrame(array_of_categorizations) |     df = pd.DataFrame(array_of_categorizations) | ||||||
|     #df.to_csv('072525_olmo_messages_categorized.csv', index=False) |     df.to_csv('072525_olmo_messages_categorized.csv', index=False) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -8,7 +8,7 @@ | |||||||
| #SBATCH --mem=64G | #SBATCH --mem=64G | ||||||
| #SBATCH --cpus-per-task=4 | #SBATCH --cpus-per-task=4 | ||||||
| #SBATCH --job-name=biberplus-categorization  | #SBATCH --job-name=biberplus-categorization  | ||||||
| #SBATCH --output=biberplus-categorization.log | #SBATCH --output=cleaned_biberplus-categorization.log | ||||||
| #SBATCH --mail-type=BEGIN,END,FAIL | #SBATCH --mail-type=BEGIN,END,FAIL | ||||||
| #SBATCH --mail-user=gaughan@u.northwestern.edu | #SBATCH --mail-user=gaughan@u.northwestern.edu | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -24,3 +24,5 @@ conda activate olmo | |||||||
| echo "running the bertopic job at $(date)" | echo "running the bertopic job at $(date)" | ||||||
| 
 | 
 | ||||||
| python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/info_labeling.py | python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/info_labeling.py | ||||||
|  | 
 | ||||||
|  | echo "unsupervised olmo categorization pau at $(date)"     | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user