
updating with tentative neurobiber labels, need to verify outputs

mgaughan 2025-07-14 15:38:23 -05:00
parent c4dd45e344
commit 7e8fb1982b
5 changed files with 151728 additions and 5 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,8 @@
 import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import random
+import pandas as pd
 
 MODEL_NAME = "Blablablab/neurobiber"
 CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512
@@ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
     return batch_preds[0]
 
 if __name__ == "__main__":
-    print("my brain hurts!")
     #https://huggingface.co/Blablablab/neurobiber
+    '''
+    docs = [
+        "First text goes here.",
+        "Second text, slightly different style."
+    ]
+    '''
+
+    #load the discussion data from the universal CSV
+    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
+    #format the comments for the neurobiber model
+    docs = first_discussion_df["comment_text"].astype(str).tolist()
+    #load the model and run the predictions
+    model, tokenizer = load_model_and_tokenizer()
+    preds = predict_batch(model, tokenizer, docs)
+    #new columns in the df for the predicted neurobiber features
+    preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
+    preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
+    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
+    #assert that row order has been preserved
+    for _ in range(10):
+        random_index = random.choice(first_discussion_df.index)
+        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+    #assert that first_discussion_df and final_discussion_df have the same number of rows
+    assert len(first_discussion_df) == len(final_discussion_df)
+    #if the prior asserts pass, write out to a csv
+
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False)
+    print('neurobiber labeling pau')
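
Since the commit message flags these labels as tentative, one way to verify the outputs is a spot-check of the written CSV: re-predict a sample of comments and compare against the stored columns. A minimal sketch, reusing load_model_and_tokenizer and predict_text from the script above; treating the labels as binary 0/1 values is an assumption about Neurobiber's output format, not something shown in this diff:

import random

import pandas as pd

# assumes the labeling script is importable as a module; its __main__
# guard keeps the full pipeline from re-running on import
from neurobiber_labeling import load_model_and_tokenizer, predict_text

LABELS_CSV = "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv"
PRED_COLS = [f"neurobiber_{i+1}" for i in range(96)]

labels_df = pd.read_csv(LABELS_CSV)

# structural checks: all 96 label columns present, values binary
# (the 0/1 range is an assumption about the model's outputs)
assert all(col in labels_df.columns for col in PRED_COLS)
assert labels_df[PRED_COLS].isin([0, 1]).all().all()

# re-predict a handful of sampled comments and compare to stored labels
model, tokenizer = load_model_and_tokenizer()
for idx in random.sample(list(labels_df.index), 10):
    text = str(labels_df.loc[idx, "comment_text"])
    fresh = predict_text(model, tokenizer, text)
    stored = labels_df.loc[idx, PRED_COLS].to_numpy()
    assert (fresh == stored).all(), f"label mismatch at row {idx}"

print("spot-check pau")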

View File

@@ -0,0 +1,6 @@
+starting the job at: Mon Jul 14 15:25:44 CDT 2025
+setting up the environment
+running the neurobiber labeling script
+neurobiber labeling pau
+job finished, cleaning up
+job pau at: Mon Jul 14 15:27:39 CDT 2025

View File

@@ -20,7 +20,7 @@ module purge
 eval "$(conda shell.bash hook)"
 conda activate neurobiber
 
-echo "running the p1 categorization script"
+echo "running the neurobiber labeling script"
 
 python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py
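
For context, the log file added above traces this job script end to end. A plausible reconstruction of the surrounding lines, with the echoed messages taken from that log; the SBATCH directive is a placeholder, since the diff only shows this one hunk:

#!/bin/bash
#SBATCH --job-name=neurobiber_labeling   # placeholder; directives not shown in the diff

echo "starting the job at: $(date)"

echo "setting up the environment"
module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the neurobiber labeling script"
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

echo "job finished, cleaning up"
echo "job pau at: $(date)"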

View File

@@ -1,6 +1,6 @@
-[ ] generate clean rows for each comment from the discussion data
-[ ] get data onto quest
-[ ] run neurobiber over the data set, appending vectors onto the array
+[ x ] generate clean rows for each comment from the discussion data
+[ x ] get data onto quest
+[ x ] run neurobiber over the data set, appending vectors onto the array
 [ ] set up the unsupervised classification pipeline
 [ ] iterate the prompt
 [ ] run the classification of the prompt