updating with some structure for discussion analysis stuff
commit 8f2409feb0
parent 68ec9c75f6

97  p2/quest/python_scripts/neurobiber_labeling.py  Normal file
@@ -0,0 +1,97 @@
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to("cuda")
    model.eval()
    return model, tokenizer

def chunk_text(text, chunk_size=CHUNK_SIZE):
    tokens = text.strip().split()
    if not tokens:
        return []
    return [" ".join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

def get_predictions_chunked_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    # Chunk each text, score all chunks in sub-batches on the GPU,
    # then max-pool chunk probabilities per feature and threshold at 0.5
    chunked_texts = []
    chunk_indices = []
    for idx, text in enumerate(texts):
        start = len(chunked_texts)
        text_chunks = chunk_text(text, chunk_size)
        chunked_texts.extend(text_chunks)
        chunk_indices.append({
            'original_idx': idx,
            'chunk_range': (start, start + len(text_chunks))
        })

    # If there are no chunks (empty inputs), return zeros
    if not chunked_texts:
        return np.zeros((len(texts), model.config.num_labels))

    all_chunk_preds = []
    for i in range(0, len(chunked_texts), subbatch_size):
        batch_chunks = chunked_texts[i : i + subbatch_size]
        encodings = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=chunk_size
        ).to("cuda")

        with torch.no_grad(), torch.amp.autocast("cuda"):
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits)
        all_chunk_preds.append(probs.cpu())

    all_chunk_preds = torch.cat(all_chunk_preds, dim=0) if all_chunk_preds else torch.empty(0)
    predictions = [None] * len(texts)

    for info in chunk_indices:
        start, end = info['chunk_range']
        if start == end:
            # No tokens => no features
            pred = torch.zeros(model.config.num_labels)
        else:
            # Take max across chunks for each feature
            chunk_preds = all_chunk_preds[start:end]
            pred, _ = torch.max(chunk_preds, dim=0)
        predictions[info['original_idx']] = (pred > 0.5).int().numpy()

    return np.array(predictions)

def predict_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    return get_predictions_chunked_batch(model, tokenizer, texts, chunk_size, subbatch_size)

def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
    batch_preds = predict_batch(model, tokenizer, [text], chunk_size, subbatch_size)
    return batch_preds[0]

if __name__ == "__main__":
    print("my brain hurts!")
    #https://huggingface.co/Blablablab/neurobiber
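
For reference, a minimal usage sketch of the module above, assuming a CUDA-capable node and that the module is importable as committed; the example texts and variable names are hypothetical and not part of this commit:

    model, tokenizer = load_model_and_tokenizer()
    texts = ["I think this proposal is a bad idea.", ""]  # hypothetical discussion comments
    preds = predict_batch(model, tokenizer, texts)        # int array, shape (len(texts), 96)
    # map the first text's binary vector back to feature names
    active = [feat for feat, flag in zip(BIBER_FEATURES, preds[0]) if flag]
    print(active)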

32  p2/quest/slurm_jobs/neurobiber_label.sh  Normal file
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-categorization
#SBATCH --output=neurobiber-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the neurobiber labeling script"

python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
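
Assuming standard SLURM tooling on Quest, the script above would be submitted from a login node with sbatch p2/quest/slurm_jobs/neurobiber_label.sh; progress and errors land in neurobiber-categorization.log per the --output directive.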

31  p2/quest/slurm_jobs/unsupervised_categorizaton_job.sh  Normal file
@@ -0,0 +1,31 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=p1-categorization
#SBATCH --output=p1-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate olmo

echo "running the p1 categorization script"


echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"

6  p2/quest/todo.txt  Normal file
@@ -0,0 +1,6 @@
[ ] generate clean rows for each comment from the discussion data
[ ] get data onto quest
[ ] run neurobiber over the data set, appending vectors onto the array
[ ] set up the unsupervised classification pipeline
[ ] iterate the prompt
[ ] run the classification of the prompt
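
A rough sketch of the third item (running neurobiber over the data set and appending the feature vectors), assuming the cleaned comment rows sit in a CSV with a comment_text column; the file names, column name, and import path below are placeholders, not part of this commit:

    import pandas as pd
    from neurobiber_labeling import load_model_and_tokenizer, predict_batch, BIBER_FEATURES

    df = pd.read_csv("discussion_comments.csv")   # hypothetical cleaned comment rows
    model, tokenizer = load_model_and_tokenizer()
    vectors = predict_batch(model, tokenizer, df["comment_text"].fillna("").tolist())
    # one 0/1 column per Biber feature, appended to the original rows
    vec_df = pd.DataFrame(vectors, columns=BIBER_FEATURES, index=df.index)
    pd.concat([df, vec_df], axis=1).to_csv("discussion_comments_with_biber.csv", index=False)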