# mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py

from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def format_df_data(df: pd.DataFrame) -> np.ndarray:
    """Return the ``normalized_*`` columns of *df* as a float matrix.

    The feature values were saved one per column, each column label carrying
    the ``normalized_`` prefix; every other column is ignored.

    Args:
        df: Table containing zero or more columns whose label starts with
            ``normalized_``.

    Returns:
        A 2-D array with one row per row of *df* and one column per matching
        column (in the order those columns appear in *df*), cast to float.
        Any error raised by ``DataFrame.astype(float)`` for a non-numeric
        value is propagated unchanged.
    """
    # Column labels are not guaranteed to be strings (e.g. integer labels),
    # so check the type before calling str.startswith on them.
    normalized_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('normalized_')
    ]
    return df[normalized_cols].astype(float).to_numpy()

if __name__ == "__main__":
    # Load the labelled feature table and keep only the rows whose
    # comment_type is 'task_description'.
    all_rows = pd.read_csv(
        "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv",
        low_memory=False,
    )
    is_task_description = all_rows['comment_type'] == 'task_description'
    task_rows = all_rows[is_task_description]
    feature_matrix = format_df_data(task_rows)

    # Project the feature vectors onto two components with RBF-kernel PCA.
    reducer = KernelPCA(n_components=2, kernel="rbf")
    embedding = reducer.fit_transform(feature_matrix)
    first_component = embedding[:, 0]
    second_component = embedding[:, 1]

    # Integer-encode AuthorWMFAffil so each point can be coloured by it.
    affil_encoder = LabelEncoder()
    affil_codes = affil_encoder.fit_transform(task_rows['AuthorWMFAffil'])

    # Figure 1: plain matplotlib scatter on pyplot's implicit current figure,
    # coloured by the encoded affiliation. Saving it is currently disabled.
    plt.scatter(
        first_component,
        second_component,
        c=affil_codes,
        edgecolor='none',
        alpha=0.5,
        cmap="viridis",
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.colorbar()

    #plt.savefig("090225_biber_pca_plot.png", dpi=300)

    # Figure 2: seaborn scatter with a categorical legend. The affiliation
    # column is cast to str so that hue treats every value as a category label.
    affil_labels = task_rows["AuthorWMFAffil"].astype(str)
    plot_columns = {
        "PC1": first_component,
        "PC2": second_component,
        "AuthorWMFAffil": affil_labels,
    }
    scatter_frame = pd.DataFrame(plot_columns)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        data=scatter_frame,
        x="PC1",
        y="PC2",
        hue="AuthorWMFAffil",
        palette="tab10",
        s=40,
        alpha=0.7,
        edgecolor=None,
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(title='AuthorWMFAffil', bbox_to_anchor=(1.05, 1), loc=2)
    plt.tight_layout()
    plt.savefig("biber_kernelpca_affil.png", dpi=300)
    plt.show()