from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def format_df_data(df):
    """Return the ``normalized_*`` columns of *df* as a float array.

    Args:
        df: DataFrame whose feature columns have names starting with
            ``'normalized_'``.

    Returns:
        The values of those columns cast to ``float``, one row per row of
        *df* and one column per matching column, in *df*'s column order.
    """
    #this accounts for the somewhat idiosyncratic way that I saved my data
    normalized_cols = [col for col in df.columns if col.startswith('normalized_')]
    x = df[normalized_cols].astype(float).values
    #x = np.vstack(df['features'].values)
    return x


def main():
    """Run PCA on the task-description vectors and plot PC1 against PC2.

    Reads the labelled CSV, keeps the rows whose ``comment_type`` is
    ``'task_description'``, projects their ``normalized_*`` columns onto 18
    principal components, prints the variance of each projected component,
    and draws two scatter plots of the first two components coloured by
    ``selected_axis``. The seaborn figure is saved as a PNG (relative path)
    and every open figure is then shown.

    Raises:
        ValueError: If no rows remain after the ``comment_type`` filter.
    """
    biber_vec_df = pd.read_csv(
        "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv",
        low_memory=False,
    )
    biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
    if biber_vec_df.empty:
        # Fail here with a readable message instead of deep inside PCA.
        raise ValueError("no rows with comment_type == 'task_description' to run PCA on")
    biber_vecs = format_df_data(biber_vec_df)

    #handoff to PCA model
    # Trial run kept for reference: counts the components needed to reach
    # 90% cumulative explained variance.
    #   pca_trial = PCA()
    #   biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)
    #   explained_variance = pca_trial.explained_variance_ratio_
    #   cumulative_variance = np.cumsum(explained_variance)
    #   n_components = np.argmax(cumulative_variance >= 0.90) + 1
    #   print(f"Number of PCs explaining 90% variance: {n_components}")
    # NOTE(review): 18 is presumably the result of that trial -- confirm.
    pca = PCA(n_components=18)
    biber_vecs_pca = pca.fit_transform(biber_vecs)

    # Column used to colour both plots, title the legend and prefix the
    # output filename.
    selected_axis = "source"

    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)

    # Quick-look matplotlib scatter: integer-encode the selected label column
    # so it can be mapped through a colormap.
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df[selected_axis])
    plt.scatter(
        biber_vecs_pca[:, 0],
        biber_vecs_pca[:, 1],
        c=colors,
        edgecolor='none',
        alpha=0.5,
        cmap="viridis",
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.colorbar()
    #plt.savefig("090225_biber_pca_plot.png", dpi=300)

    # The filtered frame keeps its original, non-contiguous index; hand the
    # labels over positionally so they pair row-for-row with the PCA output.
    labels = biber_vec_df[selected_axis].astype(str).to_numpy()
    plot_df = pd.DataFrame({
        "PC1": biber_vecs_pca[:, 0],
        "PC2": biber_vecs_pca[:, 1],
        selected_axis: labels,
    })

    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        data=plot_df,
        x="PC1",
        y="PC2",
        # Must follow selected_axis: plot_df only has a column by that name.
        hue=selected_axis,
        palette="tab10",
        s=40,
        alpha=0.7,
        edgecolor=None,
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
    plt.tight_layout()
    # NOTE(review): the filename says "kernelpca" but plain PCA is used above;
    # the name is left unchanged so existing outputs keep their path.
    plt.savefig(f"{selected_axis}_090425_biber_kernelpca_affil.png", dpi=300)
    plt.show()


if __name__ == "__main__":
    main()