"""Fit a 2-component PCA on the ``normalized_*`` feature columns and plot it.

The script reads a labelled CSV, keeps only rows of one ``comment_type``,
projects the ``normalized_*`` columns onto two principal components, and
saves a scatter plot coloured by the ``AuthorWMFAffil`` column.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Defaults for the script entry point; override them via ``main()`` arguments.
INPUT_CSV = "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv"
OUTPUT_PNG = "090225_biber_pca_plot.png"

FEATURE_PREFIX = "normalized_"     # columns holding the feature values
COMMENT_TYPE_COLUMN = "comment_type"
COMMENT_TYPE = "task_description"  # only rows of this comment type are plotted
LABEL_COLUMN = "AuthorWMFAffil"    # column used to colour the points


def format_df_data(df: pd.DataFrame) -> np.ndarray:
    """Return the ``normalized_*`` columns of *df* as a 2-D float array.

    This accounts for the somewhat idiosyncratic way the data was saved:
    each feature lives in its own column whose name starts with
    ``normalized_`` (rather than in a single array-valued column).

    Args:
        df: Frame containing zero or more ``normalized_*`` columns.

    Returns:
        Array of shape ``(len(df), n_normalized_columns)``, columns in the
        same order as they appear in *df*.
    """
    normalized_cols = [col for col in df.columns if col.startswith(FEATURE_PREFIX)]
    return df[normalized_cols].astype(float).to_numpy()


def main(input_csv: str = INPUT_CSV, output_png: str = OUTPUT_PNG) -> None:
    """Run the PCA on the filtered rows of *input_csv* and save the plot.

    Args:
        input_csv: Path of the CSV file to read.
        output_png: Path the scatter plot is written to.

    Raises:
        ValueError: If fewer than two rows or two feature columns remain
            after filtering, since a 2-component PCA cannot be fitted.
    """
    biber_vec_df = pd.read_csv(input_csv, low_memory=False)
    biber_vec_df = biber_vec_df[biber_vec_df[COMMENT_TYPE_COLUMN] == COMMENT_TYPE]
    biber_vecs = format_df_data(biber_vec_df)

    # Fail with a readable message instead of an error from inside PCA.
    if min(biber_vecs.shape) < 2:
        raise ValueError(
            "need at least 2 rows and 2 feature columns for a 2-component PCA, "
            f"got array of shape {biber_vecs.shape}"
        )

    # Hand off to the PCA model.
    pca = PCA(2)
    biber_vecs_pca = pca.fit_transform(biber_vecs)

    # Colour the points by the label column (rows are already restricted to
    # a single comment type above).
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df[LABEL_COLUMN])

    points = plt.scatter(
        biber_vecs_pca[:, 0],
        biber_vecs_pca[:, 1],
        c=colors,
        edgecolor="none",
        alpha=0.5,
        cmap="viridis",
    )
    plt.xlabel("component 1")
    plt.ylabel("component 2")

    # The colours are integer codes from the LabelEncoder; label the colorbar
    # ticks with the original class values so the plot can be read.
    cbar = plt.colorbar(points)
    cbar.set_ticks(np.arange(len(le.classes_)))
    cbar.set_ticklabels([str(cls) for cls in le.classes_])

    plt.savefig(output_png, dpi=300)


if __name__ == "__main__":
    main()