# mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py

from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import ast

# Feature names in Neurobiber's output order: the 96 binary Biber tag
# features (BIN_*) followed by two length features (sentence_count,
# median_sentence_length), for 98 entries in total.
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X",
    "sentence_count", "median_sentence_length"
]

# Expected feature column names (100 entries): the 96 Biber tags with a
# "normalized_" prefix, then normalized_AWL and normalized_TTR, then the two
# length features.
# NOTE(review): format_df_data selects every DataFrame column starting with
# "normalized_" in DataFrame order, so this list lines up with that matrix
# only if the input CSV has exactly these columns in this order -- confirm.
selected_cols = [
    "normalized_QUAN","normalized_QUPR","normalized_AMP","normalized_PASS","normalized_XX0","normalized_JJ",
    "normalized_BEMA","normalized_CAUS","normalized_CONC","normalized_COND","normalized_CONJ","normalized_CONT",
    "normalized_DPAR","normalized_DWNT","normalized_EX","normalized_FPP1","normalized_GER","normalized_RB",
    "normalized_PIN","normalized_INPR","normalized_TO","normalized_NEMD","normalized_OSUB","normalized_PASTP",
    "normalized_VBD","normalized_PHC","normalized_PIRE","normalized_PLACE","normalized_POMD","normalized_PRMD",
    "normalized_WZPRES","normalized_VPRT","normalized_PRIV","normalized_PIT","normalized_PUBV","normalized_SPP2",
    "normalized_SMP","normalized_SERE","normalized_STPR","normalized_SUAV","normalized_SYNE","normalized_TPP3",
    "normalized_TIME","normalized_NOMZ","normalized_BYPA","normalized_PRED","normalized_TOBJ","normalized_TSUB",
    "normalized_THVC","normalized_NN","normalized_DEMP","normalized_DEMO","normalized_WHQU","normalized_EMPH",
    "normalized_HDG","normalized_WZPAST","normalized_THAC","normalized_PEAS","normalized_ANDC","normalized_PRESP",
    "normalized_PROD","normalized_SPAU","normalized_SPIN","normalized_THATD","normalized_WHOBJ","normalized_WHSUB",
    "normalized_WHCL","normalized_ART","normalized_AUXB","normalized_CAP","normalized_SCONJ","normalized_CCONJ",
    "normalized_DET","normalized_EMOJ","normalized_EMOT","normalized_EXCL","normalized_HASH","normalized_INF",
    "normalized_UH","normalized_NUM","normalized_LAUGH","normalized_PRP","normalized_PREP","normalized_NNP",
    "normalized_QUES","normalized_QUOT","normalized_AT","normalized_SBJP","normalized_URL","normalized_WH",
    "normalized_INDA","normalized_ACCU","normalized_PGAS","normalized_CMADJ","normalized_SPADJ","normalized_X",
    "normalized_AWL", "normalized_TTR","sentence_count", "median_sentence_length"
]


def safe_parse(x):
    """Coerce a stored cell value into a list, falling back to ``[]``.

    Cells read back from CSV hold the ``repr`` of a Python list (or NaN when
    empty). This parses such strings with ``ast.literal_eval`` and guarantees
    the result is a list, so callers can safely take ``len()`` of it and
    iterate over it.

    Args:
        x: A list, a string containing a Python literal, NaN, or anything else.

    Returns:
        ``x`` itself if it is already a list; the parsed value if the string
        evaluates to a list (a tuple is converted to a list); otherwise ``[]``.
    """
    if isinstance(x, list):
        return x
    # NaN (the pandas marker for a missing cell) and any other non-string
    # value fall through to the empty-list default below.
    if isinstance(x, float) and np.isnan(x):
        return []
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed or non-literal text: treat as "no sentences".
        return []
    # literal_eval happily returns ints, dicts, bare strings, etc.; only a
    # sequence of items is meaningful here.
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    return []

def format_df_data(df):
    """Build the numeric feature matrix that is handed to PCA.

    The matrix holds every column of ``df`` whose name starts with
    ``normalized_`` (in DataFrame column order, cast to float), followed by
    two length features computed from ``olmo_cleaned_sentences``: the number
    of sentences and the median sentence length in whitespace-separated
    tokens (0 when there are no sentences).

    Side effects: overwrites ``df['olmo_cleaned_sentences']`` in place with
    the parsed values and prints that column and the median-length array.

    Args:
        df: DataFrame with ``normalized_*`` columns and an
            ``olmo_cleaned_sentences`` column.

    Returns:
        2-D numpy array with one row per DataFrame row.
    """
    # this accounts for the somewhat idiosyncratic way that I saved my data
    feature_names = [name for name in df.columns if name.startswith('normalized_')]
    feature_matrix = df[feature_names].astype(float).values

    def _median_token_count(sents):
        # Median number of whitespace-separated tokens per sentence.
        if len(sents) > 0:
            return np.median([len(sent.split()) for sent in sents])
        return 0

    # 101325 additions to account for length
    df['olmo_cleaned_sentences'] = df['olmo_cleaned_sentences'].apply(safe_parse)
    print(df['olmo_cleaned_sentences'])

    # Both length features become (n_rows, 1) column vectors so they can be
    # stacked to the right of the feature matrix.
    n_sentences = df['olmo_cleaned_sentences'].apply(len).values.reshape(-1, 1)
    median_len = df['olmo_cleaned_sentences'].apply(_median_token_count).values.reshape(-1, 1)
    print(median_len)

    return np.hstack([feature_matrix, n_sentences, median_len])

if __name__ == "__main__":
    # Script entry point: load the unified comment table, build the feature
    # matrix, fit a PCA keeping enough components for 90% of the variance,
    # then save the fitted model and a per-comment table of PC scores.
    biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", low_memory=False)
    #biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
    #biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
    #biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
    biber_vecs = format_df_data(biber_vec_df)

    # Labels for the matrix columns, derived the same way format_df_data
    # builds the matrix (normalized_* columns in DataFrame order, then the two
    # length features). Indexing a hand-written list instead mislabels the
    # loadings whenever the CSV's columns differ from it in count or order.
    feature_names = [col for col in biber_vec_df.columns if col.startswith('normalized_')]
    feature_names += ["sentence_count", "median_sentence_length"]
    if len(feature_names) != biber_vecs.shape[1]:
        raise ValueError(
            f"Feature label count ({len(feature_names)}) does not match "
            f"feature matrix width ({biber_vecs.shape[1]})"
        )

    #handoff to PCA model

    # Trial fit with all components, only to read the variance spectrum.
    pca_trial = PCA()
    pca_trial.fit(biber_vecs)

    explained_variance = pca_trial.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    # argmax on a boolean array gives the first index where the cumulative
    # ratio reaches 0.90; +1 turns that index into a component count.
    argmax_components = int(np.argmax(cumulative_variance >= 0.90)) + 1
    print(f"Number of PCs explaining 90% variance: {argmax_components}")

    pca = PCA(n_components=argmax_components)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    with open('102025_total_pca.pkl', 'wb') as f:
        pickle.dump(pca, f)
    selected_axis = "AuthorWMFAffil"  # only used by the commented-out plotting code below

    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)

    # For each retained component, list the ten features with the largest
    # absolute loading.
    for i, component in enumerate(pca.components_):
        print(f"PC{i+1}:")
        indices = np.argsort(np.abs(component))[::-1]
        for idx in indices[:10]:  # Top 10
            print(f"{feature_names[idx]}: {component[idx]:.3f}")

    #first looking at comment_type
    #le = LabelEncoder()
    #colors = le.fit_transform(biber_vec_df[selected_axis])

    # One column per retained component, followed by the metadata columns
    # carried over from the input table (insertion order = CSV column order).
    pc_dict = {f"PC{i+1}": biber_vecs_pca[:, i] for i in range(argmax_components)}
    #pc_dict[selected_axis] = biber_vec_df[selected_axis].astype(str)
    pc_dict["source"] = biber_vec_df['source'].astype(str)
    pc_dict["phase"] = biber_vec_df['phase'].astype(str)
    pc_dict["text"] = biber_vec_df['comment_text'].astype(str)
    for passthrough_col in (
        'id',
        'week_index',
        'priority',
        'resolution_outcome',
        'TaskPHID',
        'AuthorPHID',
        'date_created',
        'comment_type',
    ):
        pc_dict[passthrough_col] = biber_vec_df[passthrough_col]

    plot_df = pd.DataFrame(pc_dict)
    plot_df.to_csv("102025_total_pca_df.csv", index=False)

    # Show the extreme rows on the first two components. PC2 is skipped when
    # a single component already explains 90% of the variance, since the
    # column would not exist.
    for pc_name in ("PC1", "PC2"):
        if pc_name not in plot_df.columns:
            continue
        print(f"Top 10 {pc_name} values:")
        print(plot_df.nlargest(10, pc_name))
        print(f"\nBottom 10 {pc_name} values:")
        print(plot_df.nsmallest(10, pc_name))

    #g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
    #g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
    #g.add_legend(title=selected_axis)
    #g.set_axis_labels("PC1", "PC2")
    #g.fig.subplots_adjust(top=0.9)
    #g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")

    #plt.savefig("090225_biber_pca_plot.png", dpi=300)
    #g.fig.tight_layout()
    #g.savefig(f"subcomment_{selected_axis}_100125_biber_pca_final.png", dpi=300)
    #plt.show()