
updating with tentative neurobiber labels, need to verify outputs

mgaughan 2025-07-14 15:38:23 -05:00
parent c4dd45e344
commit 7e8fb1982b
5 changed files with 151728 additions and 5 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,8 @@
 import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import random
+import pandas as pd
 
 MODEL_NAME = "Blablablab/neurobiber"
 CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512
@@ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
     return batch_preds[0]
 
 if __name__ == "__main__":
-    print("my brain hurts!")
     #https://huggingface.co/Blablablab/neurobiber
+    '''
+    docs = [
+        "First text goes here.",
+        "Second text, slightly different style."
+    ]
+    '''
+
+    #load the discussion data from the universal CSV
+    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
+    #format the comments for the neurobiber model
+    docs = first_discussion_df["comment_text"].astype(str).tolist()
+    #load the model and run the predictions
+    model, tokenizer = load_model_and_tokenizer()
+    preds = predict_batch(model, tokenizer, docs)
+    #new columns in the df for the predicted neurobiber features
+    preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
+    preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
+    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
+    #assert that row order has been preserved
+    for _ in range(10):
+        random_index = random.choice(first_discussion_df.index)
+        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+    #assert that first_discussion_df and final_discussion_df have the same number of rows
+    assert len(first_discussion_df) == len(final_discussion_df)
+    #if the prior asserts pass, write out to a csv
+
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False)
+    print('neurobiber labeling pau')
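
Since the commit message flags these labels as tentative, one way to verify the outputs is a spot-check of the written CSV: re-predict a sample of comments and compare against the stored columns. A minimal sketch, reusing load_model_and_tokenizer and predict_text from the script above; treating the labels as binary 0/1 values is an assumption about Neurobiber's output format, not something shown in this diff:

import random

import pandas as pd

# assumes the labeling script is importable as a module; its __main__
# guard keeps the full pipeline from re-running on import
from neurobiber_labeling import load_model_and_tokenizer, predict_text

LABELS_CSV = "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv"
PRED_COLS = [f"neurobiber_{i+1}" for i in range(96)]

labels_df = pd.read_csv(LABELS_CSV)

# structural checks: all 96 label columns present, values binary
# (the 0/1 range is an assumption about the model's outputs)
assert all(col in labels_df.columns for col in PRED_COLS)
assert labels_df[PRED_COLS].isin([0, 1]).all().all()

# re-predict a handful of sampled comments and compare to stored labels
model, tokenizer = load_model_and_tokenizer()
for idx in random.sample(list(labels_df.index), 10):
    text = str(labels_df.loc[idx, "comment_text"])
    fresh = predict_text(model, tokenizer, text)
    stored = labels_df.loc[idx, PRED_COLS].to_numpy()
    assert (fresh == stored).all(), f"label mismatch at row {idx}"

print("spot-check pau")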

View File

@@ -0,0 +1,6 @@
+starting the job at: Mon Jul 14 15:25:44 CDT 2025
+setting up the environment
+running the neurobiber labeling script
+neurobiber labeling pau
+job finished, cleaning up
+job pau at: Mon Jul 14 15:27:39 CDT 2025

View File

@@ -20,7 +20,7 @@ module purge
 eval "$(conda shell.bash hook)"
 conda activate neurobiber
 
-echo "running the p1 categorization script"
+echo "running the neurobiber labeling script"
 
 python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py
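
For context, the log file added above traces this job script end to end. A plausible reconstruction of the surrounding lines, with the echoed messages taken from that log; the SBATCH directive is a placeholder, since the diff only shows this one hunk:

#!/bin/bash
#SBATCH --job-name=neurobiber_labeling   # placeholder; directives not shown in the diff

echo "starting the job at: $(date)"

echo "setting up the environment"
module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the neurobiber labeling script"
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

echo "job finished, cleaning up"
echo "job pau at: $(date)"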

View File

@@ -1,6 +1,6 @@
-[ ] generate clean rows for each comment from the discussion data
-[ ] get data onto quest
-[ ] run neurobiber over the data set, appending vectors onto the array
+[ x ] generate clean rows for each comment from the discussion data
+[ x ] get data onto quest
+[ x ] run neurobiber over the data set, appending vectors onto the array
 [ ] set up the unsupervised classification pipeline
 [ ] iterate the prompt
 [ ] run the classification of the prompt