diff --git a/p2/quest/neurobiber-pca.log b/p2/quest/neurobiber-pca.log index 0bc7040..d8e1cb2 100644 --- a/p2/quest/neurobiber-pca.log +++ b/p2/quest/neurobiber-pca.log @@ -1,9 +1,8 @@ -starting the job at: Thu Sep 4 10:09:58 CDT 2025 +starting the job at: Thu Sep 4 10:23:23 CDT 2025 setting up the environment running the neurobiber labeling script -Number of PCs explaining 90% variance: 18 Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521 7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747 4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119] job finished, cleaning up -job pau at: Thu Sep 4 10:10:21 CDT 2025 +job pau at: Thu Sep 4 10:23:47 CDT 2025 diff --git a/p2/quest/phase_090425_biber_kernelpca_affil.png b/p2/quest/phase_090425_biber_kernelpca_affil.png new file mode 100644 index 0000000..872d105 Binary files /dev/null and b/p2/quest/phase_090425_biber_kernelpca_affil.png differ diff --git a/p2/quest/python_scripts/neurobiber_PCA.py b/p2/quest/python_scripts/neurobiber_PCA.py index 9c9b5ec..181ac74 100644 --- a/p2/quest/python_scripts/neurobiber_PCA.py +++ b/p2/quest/python_scripts/neurobiber_PCA.py @@ -32,7 +32,7 @@ if __name__ == "__main__": ''' pca = PCA(n_components=18) biber_vecs_pca = pca.fit_transform(biber_vecs) - selected_axis = "source" + selected_axis = "phase" component_variances = np.var(biber_vecs_pca, axis=0) print("Variance of each PCA component:", component_variances) @@ -41,14 +41,23 @@ if __name__ == "__main__": le = LabelEncoder() colors = le.fit_transform(biber_vec_df[selected_axis]) - plt.scatter(biber_vecs_pca[:, 0], biber_vecs_pca[:, 1], - c=colors, edgecolor='none', alpha=0.5, cmap="viridis") - plt.xlabel('component 1') - plt.ylabel('component 2') - plt.colorbar() + plot_df = pd.DataFrame({ + "PC1": biber_vecs_pca[:, 0], + "PC2": biber_vecs_pca[:, 1], + selected_axis: biber_vec_df[selected_axis].astype(str), + "source":biber_vec_df['source'].astype(str) + }) + + + g = sns.FacetGrid(plot_df, col="source", col_wrap=4, hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False) + g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40) + g.add_legend(title=selected_axis) + g.set_axis_labels("PC1", "PC2") + g.fig.subplots_adjust(top=0.9) + g.fig.suptitle(f"PCA by {selected_axis}, faceted by source") #plt.savefig("090225_biber_pca_plot.png", dpi=300) - + ''' plot_df = pd.DataFrame({ "PC1": biber_vecs_pca[:, 0], "PC2": biber_vecs_pca[:, 1], @@ -62,6 +71,7 @@ if __name__ == "__main__": plt.xlabel('component 1') plt.ylabel('component 2') plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2) - plt.tight_layout() - plt.savefig(f"{selected_axis}_090425_biber_kernelpca_affil.png", dpi=300) + ''' + g.fig.tight_layout() + g.savefig(f"{selected_axis}_090425_biber_kernelpca_affil.png", dpi=300) plt.show() diff --git a/p2/quest/python_scripts/olmo_parallel_cat.py b/p2/quest/python_scripts/olmo_parallel_cat.py index f20c6ce..b71ece0 100644 --- a/p2/quest/python_scripts/olmo_parallel_cat.py +++ b/p2/quest/python_scripts/olmo_parallel_cat.py @@ -14,7 +14,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM import csv import pandas as pd import re + import nltk +nltk.download('punkt') # ----------------- prompts for LLM priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference. Your task is to match the**GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."