mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py

from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def format_df_data(df):
    #this accounts for the somewhat idiosyncratic way that I saved my data
    normalized_cols = [col for col in df.columns if col.startswith('normalized_')]
    x = df[normalized_cols].astype(float).values
    #x = np.vstack(df['features'].values)
    return x

if __name__ == "__main__":
    biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", low_memory=False)
    biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
    biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
    #biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
    biber_vecs = format_df_data(biber_vec_df)
    #handoff to PCA model
    '''
    pca_trial = PCA()
    biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)

    explained_variance = pca_trial.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    n_components = np.argmax(cumulative_variance >= 0.90) + 1
    print(f"Number of PCs explaining 90% variance: {n_components}")
    '''
    pca = PCA(n_components=18)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    selected_axis = "AuthorWMFAffil"

    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)


    #first looking at comment_type
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df[selected_axis])

    pc_dict = {f"PC{i+1}": biber_vecs_pca[:, i] for i in range(18)}
    pc_dict[selected_axis] = biber_vec_df[selected_axis].astype(str)
    pc_dict["source"] = biber_vec_df['source'].astype(str)
    pc_dict["phase"] = biber_vec_df['phase'].astype(str)
    pc_dict["text"] = biber_vec_df['comment_text'].astype(str)
    pc_dict['id'] = biber_vec_df['id']
    pc_dict['week_index'] = biber_vec_df['week_index']
    pc_dict['priority'] = biber_vec_df['priority']
    pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']

    plot_df = pd.DataFrame(pc_dict)
    plot_df.to_csv("090425_description_PCA_df.csv", index=False)

    print("Top 10 PC1 values:")
    print(plot_df.nlargest(10, "PC1"))
    print("\nBottom 10 PC1 values:")
    print(plot_df.nsmallest(10, "PC1"))

    print("Top 10 PC2 values:")
    print(plot_df.nlargest(10, "PC2"))
    print("\nBottom 10 PC2 values:")
    print(plot_df.nsmallest(10, "PC2"))


    g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
    g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
    g.add_legend(title=selected_axis)
    g.set_axis_labels("PC1", "PC2")
    g.fig.subplots_adjust(top=0.9)
    g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")

    #plt.savefig("090225_biber_pca_plot.png", dpi=300)
    '''
    plot_df = pd.DataFrame({
        "PC1": biber_vecs_pca[:, 0],
        "PC2": biber_vecs_pca[:, 1],
        selected_axis: biber_vec_df[selected_axis].astype(str)
    })
    plt.figure(figsize=(8,6))
    sns.scatterplot(
        data=plot_df, x="PC1", y="PC2", hue="source",
        palette="tab10", s=40, alpha=0.7, edgecolor=None
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
    '''
    g.fig.tight_layout()
    g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300)
    plt.show()