updating with biberplus implementation, though not quite solved yet
This commit is contained in:
parent 2e0665488c
commit edd17d3269
3  p2/quest/biberplus-categorization.log  Normal file
@@ -0,0 +1,3 @@
starting the job at: Tue Jul 22 16:43:27 CDT 2025
setting up the environment
running the biberplus labeling script
95  p2/quest/python_scripts/biberplus_labeling.py  Normal file
@@ -0,0 +1,95 @@
'''
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random
import pandas as pd

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
'''
import pandas as pd
import numpy as np
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
import cupy  # only needed by biberplus when use_gpu is True; unused in this CPU run
import random


def biberplus_labeler(text):
    print(len(text))
    config = load_config()
    # biber features only, on CPU, with tag counts normalized per 100 tokens
    config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
    pipeline = load_pipeline(config)
    #test = ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
    features_list = []
    for message in text:
        message_label = calculate_tag_frequencies(message, pipeline, config)
        # keep the per-tag mean frequencies, prefixed so they are
        # recognizable once joined back onto the discussion data
        mean_row = message_label.set_index('tag')['mean']
        mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
        features_list.append(mean_row)
    print(len(features_list))
    frequencies_df = pd.DataFrame(features_list)
    frequencies_df['comment_text'] = text
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df


if __name__ == "__main__":
    #https://huggingface.co/Blablablab/neurobiber
    '''
    docs = [
        "First text goes here.",
        "Second text, slightly different style."
    ]
    '''
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
    docs = first_discussion_df["comment_text"].astype(str).tolist()
    #load model and run
    #model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    #new columns in the df for the predicted neurobiber items
    #preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
    # drop the duplicate comment_text column before concatenating so the
    # order-preservation asserts below compare scalars, not two columns
    final_discussion_df = pd.concat([first_discussion_df, preds_df.drop(columns=['comment_text'])], axis=1)
    #print(type(preds))
    #assigning the predictions as a new column
    '''
    final_discussion_df = pd.merge(
        first_discussion_df,
        preds_df,
        on='comment_text',  # replace with your actual key
        how='inner'
    )
    '''
    print(len(final_discussion_df))
    #final_discussion_df["biberplus_preds"] = list(preds)
    #assert that order has been preserved
    for _ in range(10):
        random_index = random.choice(first_discussion_df.index)
        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    #assert that there are the same number of rows in first_discussion_df and final_discussion_df
    assert len(first_discussion_df) == len(final_discussion_df)
    # if passing the prior asserts, let's write to a csv
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
    print('biberplus labeling pau')
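For reference, a minimal sketch of how the retired neurobiber path (the triple-quoted block at the top of biberplus_labeling.py) could be completed, assuming "Blablablab/neurobiber" loads as a standard Hugging Face multi-label sequence-classification model; the load_model_and_tokenizer helper name mirrors the commented-out call in __main__, and the 0.5 threshold is an assumption, not part of this commit:

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

def load_model_and_tokenizer():
    # hypothetical helper matching the commented-out call in __main__
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.eval()
    return model, tokenizer

def neurobiber_labeler(texts, model, tokenizer, feature_names):
    # feature_names: the 96 BIN_* names from the BIBER_FEATURES list above
    enc = tokenizer(texts, truncation=True, max_length=CHUNK_SIZE,
                    padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
    # threshold the per-feature sigmoid scores into binary predictions
    preds = (torch.sigmoid(logits) > 0.5).int().numpy()
    return pd.DataFrame(preds, columns=feature_names)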
32  p2/quest/slurm_jobs/biberplus_label.sh  Normal file
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=biberplus-categorization
#SBATCH --output=biberplus-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
# NOTE: the labeling script currently runs biberplus on CPU (use_gpu is
# False), so the A100 and the second node requested above sit idle as-is

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the biberplus labeling script"

python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/biberplus_labeling.py

echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
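The job is submitted on Quest with sbatch p2/quest/slurm_jobs/biberplus_label.sh. A minimal sanity check of the resulting CSV (a sketch, not part of this commit; the path and the normalized_ prefix come from the files above):

import pandas as pd

labels_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv")
# one row per discussion comment: original columns plus the biberplus features
print(labels_df.shape)
# feature columns carry the "normalized_" prefix added in biberplus_labeler
print([c for c in labels_df.columns if c.startswith("normalized_")][:10])
assert "comment_text" in labels_df.columns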