updating with tentative neurobiber labels, need to verify outputs
This commit is contained in:
		
							parent
							
								
									c4dd45e344
								
							
						
					
					
						commit
						7e8fb1982b
					
				
							
								
								
									
										151688
									
								
								p2/quest/071425_neurobiber_labels.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151688
									
								
								p2/quest/071425_neurobiber_labels.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -1,6 +1,8 @@ | ||||
| import torch | ||||
| import numpy as np | ||||
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | ||||
| import random  | ||||
| import pandas as pd  | ||||
| 
 | ||||
| MODEL_NAME = "Blablablab/neurobiber" | ||||
| CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512 | ||||
| @ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32 | ||||
|     return batch_preds[0] | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     print("my brain hurts!") | ||||
|     #https://huggingface.co/Blablablab/neurobiber | ||||
|     ''' | ||||
|     docs = [ | ||||
|     "First text goes here.", | ||||
|     "Second text, slightly different style." | ||||
|     ] | ||||
|     ''' | ||||
|     #loading in the discussion data from the universal CSV | ||||
|     first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") | ||||
|     #formatting for the neurobiber model | ||||
|     docs = first_discussion_df["comment_text"].astype(str).tolist() | ||||
|     #load model and run | ||||
|     model, tokenizer = load_model_and_tokenizer() | ||||
|     preds = predict_batch(model, tokenizer, docs) | ||||
|     #new columns in the df for the predicted neurobiber items  | ||||
|     preds_cols = [f"neurobiber_{i+1}" for i in range(96)] | ||||
|     preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index) | ||||
|     final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1) | ||||
|     #assert that order has been preserved  | ||||
|     for _ in range(10): | ||||
|         random_index = random.choice(first_discussion_df.index) | ||||
|         assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] | ||||
|     #assert that there are the same number of rows in first_discussion_df and final_discussion_df | ||||
|     assert len(first_discussion_df) == len(final_discussion_df) | ||||
|     # if passing the prior asserts, let's write to a csv | ||||
|     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False) | ||||
|     print('neurobiber labeling pau') | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										6
									
								
								p2/quest/slurm_jobs/neurobiber-categorization.log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								p2/quest/slurm_jobs/neurobiber-categorization.log
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | ||||
| starting the job at: Mon Jul 14 15:25:44 CDT 2025 | ||||
| setting up the environment | ||||
| running the neurobiber labeling script | ||||
| neurobiber labeling pau | ||||
| job finished, cleaning up | ||||
| job pau at: Mon Jul 14 15:27:39 CDT 2025 | ||||
| @ -20,7 +20,7 @@ module purge | ||||
| eval "$(conda shell.bash hook)" | ||||
| conda activate neurobiber | ||||
| 
 | ||||
| echo "running the p1 categorization script" | ||||
| echo "running the neurobiber labeling script" | ||||
| 
 | ||||
| python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py | ||||
| 
 | ||||
|  | ||||
| @ -1,6 +1,6 @@ | ||||
| [ ] generate clean rows for each comment from the discussion data  | ||||
| [ ] get data onto quest  | ||||
| [ ] run neurobiber over the data set, appending vectors onto the array  | ||||
| [x] generate clean rows for each comment from the discussion data  | ||||
| [x] get data onto quest  | ||||
| [x] run neurobiber over the data set, appending vectors onto the array  | ||||
| [ ] set up the unsupervised classification pipeline  | ||||
| [ ] iterate the prompt | ||||
| [ ] run the classification of the prompt  | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user