updating with tentative neurobiber labels, need to verify outputs

2025-07-14 15:38:23 -05:00 · 2025-07-14 15:38:23 -05:00 · 7e8fb1982b
commit 7e8fb1982b
parent c4dd45e344
5 changed files with 151728 additions and 5 deletions
--- a/p2/quest/071425_neurobiber_labels.csv
+++ b/p2/quest/071425_neurobiber_labels.csv
--- a/p2/quest/python_scripts/neurobiber_labeling.py
+++ b/p2/quest/python_scripts/neurobiber_labeling.py
@@ -1,6 +1,8 @@
 import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import random 
+import pandas as pd 

 MODEL_NAME = "Blablablab/neurobiber"
 CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512
@@ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32
    return batch_preds[0]

 if __name__ == "__main__":
-    print("my brain hurts!")
    #https://huggingface.co/Blablablab/neurobiber
+    '''
+    docs = [
+    "First text goes here.",
+    "Second text, slightly different style."
+    ]
+    '''
+    #loading in the discussion data from the universal CSV
+    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
+    #formatting for the neurobiber model
+    docs = first_discussion_df["comment_text"].astype(str).tolist()
+    #load model and run
+    model, tokenizer = load_model_and_tokenizer()
+    preds = predict_batch(model, tokenizer, docs)
+    #new columns in the df for the predicted neurobiber items 
+    preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
+    preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
+    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
+    #spot-check 10 random rows: comment_text must match between the original and concatenated frames (does not check prediction alignment)
+    for _ in range(10):
+        random_index = random.choice(first_discussion_df.index)
+        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+    #assert that there are the same number of rows in first_discussion_df and final_discussion_df
+    assert len(first_discussion_df) == len(final_discussion_df)
+    # if passing the prior asserts, let's write to a csv
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False)
+    print('neurobiber labeling pau')
+
+
--- a/p2/quest/slurm_jobs/neurobiber-categorization.log
+++ b/p2/quest/slurm_jobs/neurobiber-categorization.log
@@ -0,0 +1,6 @@
+starting the job at: Mon Jul 14 15:25:44 CDT 2025
+setting up the environment
+running the neurobiber labeling script
+neurobiber labeling pau
+job finished, cleaning up
+job pau at: Mon Jul 14 15:27:39 CDT 2025
--- a/p2/quest/slurm_jobs/neurobiber_label.sh
+++ b/p2/quest/slurm_jobs/neurobiber_label.sh
@@ -20,7 +20,7 @@ module purge
 eval "$(conda shell.bash hook)"
 conda activate neurobiber

-echo "running the p1 categorization script"
+echo "running the neurobiber labeling script"

 python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

--- a/p2/quest/todo.txt
+++ b/p2/quest/todo.txt
@@ -1,6 +1,6 @@
-[ ] generate clean rows for each comment from the discussion data 
-[ ] get data onto quest 
-[ ] run neurobiber over the data set, appending vectors onto the array 
+[ x ] generate clean rows for each comment from the discussion data 
+[ x ] get data onto quest 
+[ x ] run neurobiber over the data set, appending vectors onto the array 
 [ ] set up the unsupervised classification pipeline 
 [ ] iterate the prompt
 [ ] run the classification of the prompt