first pass at implementing pca for the style vectors

2025-09-02 15:30:50 -05:00 · 2025-09-02 15:30:50 -05:00 · 89105b7660
commit 89105b7660
parent b714e8dedb
4 changed files with 71 additions and 0 deletions
--- a/p2/quest/090225_biber_pca_plot.png
+++ b/p2/quest/090225_biber_pca_plot.png
--- a/p2/quest/neurobiber-pca.log
+++ b/p2/quest/neurobiber-pca.log
@ -0,0 +1,5 @@
 starting the job at: Tue Sep  2 15:27:43 CDT 2025
 setting up the environment
 running the neurobiber labeling script
 job finished, cleaning up
 job pau at: Tue Sep  2 15:28:28 CDT 2025
--- a/p2/quest/python_scripts/neurobiber_PCA.py
+++ b/p2/quest/python_scripts/neurobiber_PCA.py
@ -0,0 +1,34 @@
 from sklearn.decomposition import PCA 
 from sklearn.preprocessing import LabelEncoder
 import pandas as pd 
 #import torch
 import numpy as np
 import pandas as pd 
 import matplotlib.pyplot as plt
 def format_df_data(df):
    """Extract the normalized feature columns of ``df`` as a float matrix.

    This accounts for the way the data was saved: each feature lives in its
    own column whose name starts with ``normalized_``. Every such column is
    selected, in the frame's column order, and cast to float.

    Args:
        df: DataFrame holding the ``normalized_*`` columns alongside any
            other columns.

    Returns:
        A 2-D NumPy float array with one row per row of ``df`` and one
        column per ``normalized_*`` column.
    """
    # Only string labels can carry the prefix; checking the type first keeps
    # frames with non-string labels (e.g. integer column names) from raising
    # AttributeError on ``startswith``.
    normalized_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('normalized_')
    ]
    return df[normalized_cols].astype(float).values
 if __name__ == "__main__":
    # Script entry point: load the labelled feature table, project the
    # normalized feature vectors onto their first two principal components,
    # and save a scatter plot coloured by comment_type.
    biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", low_memory=False)
    biber_vecs = format_df_data(biber_vec_df)
    #handoff to PCA model
    pca = PCA(2)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    #first looking at comment_type
    # LabelEncoder maps each distinct comment_type to an integer code; the
    # codes index into le.classes_, which is reused below to label the plot.
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df['comment_type'])
    plt.scatter(biber_vecs_pca[:, 0], biber_vecs_pca[:, 1],
            c=colors, edgecolor='none', alpha=0.5, cmap="viridis")
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    # The colours are categorical codes, so a bare continuous colorbar cannot
    # be read back to a comment_type. Put one tick on each code and label it
    # with the category it stands for.
    cbar = plt.colorbar(ticks=np.arange(len(le.classes_)))
    cbar.ax.set_yticklabels([str(label) for label in le.classes_])
    cbar.set_label('comment_type')
    # bbox_inches="tight" keeps the (possibly long) category labels from
    # being clipped at the figure edge.
    plt.savefig("090225_biber_pca_plot.png", dpi=300, bbox_inches="tight")
--- a/p2/quest/slurm_jobs/pca_run.sh
+++ b/p2/quest/slurm_jobs/pca_run.sh
@ -0,0 +1,32 @@
 #!/bin/bash
 # Slurm batch job: runs neurobiber_PCA.py inside the "neurobiber" conda
 # environment and writes its progress messages to neurobiber-pca.log.
 #SBATCH -A p32852
 #SBATCH -p gengpu
 # NOTE(review): the PCA script appears to be CPU-only (scikit-learn), so the
 # GPU request below may be unnecessary -- confirm before dropping it.
 #SBATCH --gres=gpu:a100:1
 # A single python process is launched below, so one node is enough; a second
 # node would sit idle for the whole allocation.
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
 #SBATCH --time=24:00:00
 #SBATCH --mem=64G
 #SBATCH --cpus-per-task=4
 #SBATCH --job-name=neurobiber-pca
 #SBATCH --output=neurobiber-pca.log
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --mail-user=gaughan@u.northwestern.edu
 echo "starting the job at: $(date)"
 echo "setting up the environment"
 module purge
 eval "$(conda shell.bash hook)"
 conda activate neurobiber
 echo "running the neurobiber PCA script"
 python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py
 # Keep the script's exit code: without this the batch script's status is that
 # of the final echo, so a crashed run would still be reported as successful.
 status=$?
 if [ "$status" -ne 0 ]; then
     echo "neurobiber PCA script failed with exit code $status"
 fi
 echo "job finished, cleaning up"
 conda deactivate
 echo "job pau at: $(date)"
 exit "$status"