"""Project NeuroBiber feature vectors to 2-D with PCA and save a scatter plot.

Reads the labeled Biber-feature CSV, extracts the ``normalized_*`` feature
columns, fits a 2-component PCA, colors each point by its ``comment_type``
label, and writes the figure to ``090225_biber_pca_plot.png``.
"""
import numpy as np
import pandas as pd

# Input CSV of per-comment Biber features and the output figure path.
INPUT_CSV = "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv"
OUTPUT_PNG = "090225_biber_pca_plot.png"


def format_df_data(df):
    """Return the ``normalized_*`` columns of *df* as a float feature matrix.

    This accounts for the somewhat idiosyncratic way the data was saved:
    each feature lives in its own ``normalized_<name>`` column rather than
    in a single packed vector column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with zero or more columns whose names start with
        ``'normalized_'``; those columns must be castable to float.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), n_normalized_columns)`` with dtype float,
        columns in the order they appear in ``df``.
    """
    normalized_cols = [col for col in df.columns if col.startswith("normalized_")]
    return df[normalized_cols].astype(float).values


def _main():
    """Load the CSV, fit a 2-component PCA, and save the colored scatter."""
    # sklearn/matplotlib are imported locally so that format_df_data can be
    # imported without requiring the plotting/ML stack to be installed.
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import LabelEncoder

    biber_vec_df = pd.read_csv(INPUT_CSV, low_memory=False)
    biber_vecs = format_df_data(biber_vec_df)

    # Hand off to the PCA model: keep the first two components for plotting.
    pca = PCA(2)
    biber_vecs_pca = pca.fit_transform(biber_vecs)

    # First looking at comment_type: encode the string labels as integers
    # so they can drive the colormap.
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df["comment_type"])

    plt.scatter(
        biber_vecs_pca[:, 0],
        biber_vecs_pca[:, 1],
        c=colors,
        edgecolor="none",
        alpha=0.5,
        cmap="viridis",
    )
    plt.xlabel("component 1")
    plt.ylabel("component 2")
    plt.colorbar()

    plt.savefig(OUTPUT_PNG, dpi=300)
    # Release the figure so repeated runs/imports don't accumulate state.
    plt.close()


if __name__ == "__main__":
    _main()
#!/bin/bash
# SLURM batch job: run the NeuroBiber PCA plotting script inside the
# 'neurobiber' conda environment on the gengpu partition.
#
# NOTE(review): the python script is serial (sklearn PCA + matplotlib),
# so a single node is sufficient — requesting 2 nodes left one idle.
# The a100 GPU request also looks unused by this CPU-only script;
# kept for now in case the partition requires it — TODO confirm.
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-pca
#SBATCH --output=neurobiber-pca.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

# Fail the job immediately if any setup step (module/conda) errors out,
# instead of running the python script in a broken environment.
set -e

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the neurobiber labeling script"

python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py

echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"