updating with tentative neurobiber labels, need to verify outputs
This commit is contained in:
		
							parent
							
								
									c4dd45e344
								
							
						
					
					
						commit
						7e8fb1982b
					
				
							
								
								
									
										151688
									
								
								p2/quest/071425_neurobiber_labels.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151688
									
								
								p2/quest/071425_neurobiber_labels.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -1,6 +1,8 @@ | |||||||
| import torch | import torch | ||||||
| import numpy as np | import numpy as np | ||||||
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | from transformers import AutoTokenizer, AutoModelForSequenceClassification | ||||||
|  | import random  | ||||||
|  | import pandas as pd  | ||||||
| 
 | 
 | ||||||
| MODEL_NAME = "Blablablab/neurobiber" | MODEL_NAME = "Blablablab/neurobiber" | ||||||
| CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512 | CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512 | ||||||
| @ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32 | |||||||
|     return batch_preds[0] |     return batch_preds[0] | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     print("my brain hurts!") |  | ||||||
|     #https://huggingface.co/Blablablab/neurobiber |     #https://huggingface.co/Blablablab/neurobiber | ||||||
|  |     ''' | ||||||
|  |     docs = [ | ||||||
|  |     "First text goes here.", | ||||||
|  |     "Second text, slightly different style." | ||||||
|  |     ] | ||||||
|  |     ''' | ||||||
|  |     #loading in the discussion data from the universal CSV | ||||||
|  |     first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") | ||||||
|  |     #formatting for the neurobiber model | ||||||
|  |     docs = first_discussion_df["comment_text"].astype(str).tolist() | ||||||
|  |     #load model and run | ||||||
|  |     model, tokenizer = load_model_and_tokenizer() | ||||||
|  |     preds = predict_batch(model, tokenizer, docs) | ||||||
|  |     #new columns in the df for the predicted neurobiber items  | ||||||
|  |     preds_cols = [f"neurobiber_{i+1}" for i in range(96)] | ||||||
|  |     preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index) | ||||||
|  |     final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1) | ||||||
|  |     #assert that order has been preserved  | ||||||
|  |     for _ in range(10): | ||||||
|  |         random_index = random.choice(first_discussion_df.index) | ||||||
|  |         assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] | ||||||
|  |     #assert that there are the same number of rows in first_discussion_df and final_discussion_df | ||||||
|  |     assert len(first_discussion_df) == len(final_discussion_df) | ||||||
|  |     # if passing the prior asserts, let's write to a csv | ||||||
|  |     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False) | ||||||
|  |     print('neurobiber labeling pau') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | |||||||
							
								
								
									
										6
									
								
								p2/quest/slurm_jobs/neurobiber-categorization.log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								p2/quest/slurm_jobs/neurobiber-categorization.log
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | |||||||
|  | starting the job at: Mon Jul 14 15:25:44 CDT 2025 | ||||||
|  | setting up the environment | ||||||
|  | running the neurobiber labeling script | ||||||
|  | neurobiber labeling pau | ||||||
|  | job finished, cleaning up | ||||||
|  | job pau at: Mon Jul 14 15:27:39 CDT 2025 | ||||||
| @ -20,7 +20,7 @@ module purge | |||||||
| eval "$(conda shell.bash hook)" | eval "$(conda shell.bash hook)" | ||||||
| conda activate neurobiber | conda activate neurobiber | ||||||
| 
 | 
 | ||||||
| echo "running the p1 categorization script" | echo "running the neurobiber labeling script" | ||||||
| 
 | 
 | ||||||
| python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py | python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| [ ] generate clean rows for each comment from the discussion data  | [ x ] generate clean rows for each comment from the discussion data  | ||||||
| [ ] get data onto quest  | [ x ] get data onto quest  | ||||||
| [ ] run neurobiber over the data set, appending vectors onto the array  | [ x ] run neurobiber over the data set, appending vectors onto the array  | ||||||
| [ ] set up the unsupervised classification pipeline  | [ ] set up the unsupervised classification pipeline  | ||||||
| [ ] iterate the prompt | [ ] iterate the prompt | ||||||
| [ ] run the classification of the prompt  | [ ] run the classification of the prompt  | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user