updating with some structure for discussion analysis stuff
commit 8f2409feb0
parent 68ec9c75f6

p2/quest/python_scripts/neurobiber_labeling.py  (new file, 97 lines)
@@ -0,0 +1,97 @@
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to("cuda")
    model.eval()
    return model, tokenizer

def chunk_text(text, chunk_size=CHUNK_SIZE):
    # Split on whitespace and group into chunks of at most chunk_size tokens
    tokens = text.strip().split()
    if not tokens:
        return []
    return [" ".join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

def get_predictions_chunked_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    # Flatten all texts into chunks, remembering which chunks belong to which input text
    chunked_texts = []
    chunk_indices = []
    for idx, text in enumerate(texts):
        start = len(chunked_texts)
        text_chunks = chunk_text(text, chunk_size)
        chunked_texts.extend(text_chunks)
        chunk_indices.append({
            'original_idx': idx,
            'chunk_range': (start, start + len(text_chunks))
        })

    # If there are no chunks (empty inputs), return zeros
    if not chunked_texts:
        return np.zeros((len(texts), model.config.num_labels))

    all_chunk_preds = []
    for i in range(0, len(chunked_texts), subbatch_size):
        batch_chunks = chunked_texts[i : i + subbatch_size]
        encodings = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=chunk_size
        ).to("cuda")

        with torch.no_grad(), torch.amp.autocast("cuda"):
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits)
        all_chunk_preds.append(probs.cpu())

    all_chunk_preds = torch.cat(all_chunk_preds, dim=0) if all_chunk_preds else torch.empty(0)
    predictions = [None] * len(texts)

    for info in chunk_indices:
        start, end = info['chunk_range']
        if start == end:
            # No tokens => no features
            pred = torch.zeros(model.config.num_labels)
        else:
            # Take max across chunks for each feature
            chunk_preds = all_chunk_preds[start:end]
            pred, _ = torch.max(chunk_preds, dim=0)
        predictions[info['original_idx']] = (pred > 0.5).int().numpy()

    return np.array(predictions)

def predict_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    return get_predictions_chunked_batch(model, tokenizer, texts, chunk_size, subbatch_size)

def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
    batch_preds = predict_batch(model, tokenizer, [text], chunk_size, subbatch_size)
    return batch_preds[0]

if __name__ == "__main__":
    print("my brain hurts!")
    # https://huggingface.co/Blablablab/neurobiber
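Not part of the commit: a minimal sketch of how these helpers might be called, assuming neurobiber_labeling.py is importable, a CUDA GPU is available (the module hard-codes "cuda"), and the model's label count matches the 96 names in BIBER_FEATURES; the example comments are invented.

    from neurobiber_labeling import BIBER_FEATURES, load_model_and_tokenizer, predict_batch

    model, tokenizer = load_model_and_tokenizer()

    # Hypothetical comments; in the pipeline these would come from the discussion data.
    comments = [
        "I think we should revert this change before the next release.",
        "lol fair enough, works for me",
    ]

    preds = predict_batch(model, tokenizer, comments)  # shape (n_texts, 96), values 0/1
    for text, vec in zip(comments, preds):
        present = [name for name, flag in zip(BIBER_FEATURES, vec) if flag]
        print(text[:40], "->", present)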
							
								
								
									
p2/quest/slurm_jobs/neurobiber_label.sh  (new file, 32 lines)
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-categorization
#SBATCH --output=neurobiber-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the neurobiber labeling script"

python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
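Presumably this job is submitted from a Quest login node with sbatch p2/quest/slurm_jobs/neurobiber_label.sh; it requests an A100 GPU, activates the neurobiber conda environment, and runs the labeling script added above.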
							
								
								
									
p2/quest/slurm_jobs/unsupervised_categorizaton_job.sh  (new file, 31 lines)
@@ -0,0 +1,31 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=p1-categorization
#SBATCH --output=p1-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate olmo

echo "running the p1 categorization script"


echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
							
								
								
									
p2/quest/todo.txt  (new file, 6 lines)
@@ -0,0 +1,6 @@
[ ] generate clean rows for each comment from the discussion data
[ ] get data onto quest
[ ] run neurobiber over the data set, appending vectors onto the array
[ ] set up the unsupervised classification pipeline
[ ] iterate the prompt
[ ] run the classification of the prompt
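Not part of the commit: a sketch of how the third item above (running Neurobiber over the data set and appending the vectors) might look, using the helpers from neurobiber_labeling.py; the file name discussion_comments.csv and the comment_text column are assumptions, and the label count is assumed to match BIBER_FEATURES.

    import pandas as pd
    from neurobiber_labeling import BIBER_FEATURES, load_model_and_tokenizer, predict_batch

    df = pd.read_csv("discussion_comments.csv")  # hypothetical cleaned comment rows
    model, tokenizer = load_model_and_tokenizer()

    preds = predict_batch(model, tokenizer, df["comment_text"].fillna("").astype(str).tolist())
    feat_df = pd.DataFrame(preds, columns=BIBER_FEATURES, index=df.index)
    df = pd.concat([df, feat_df], axis=1)  # one 0/1 column per Neurobiber feature
    df.to_csv("discussion_comments_neurobiber.csv", index=False)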