1
0
mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_PCA.py
2025-09-02 16:04:06 -05:00

54 lines
1.8 KiB
Python

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def format_df_data(df: pd.DataFrame, prefix: str = "normalized_") -> np.ndarray:
    """Return the feature columns of *df* as a 2-D float array.

    The features are stored one per column, and the feature columns are
    recognised by a shared name prefix rather than by a single packed
    column.

    Args:
        df: Table holding one feature per column.
        prefix: Column-name prefix that marks a feature column. Defaults to
            ``"normalized_"``, the prefix this script's data uses.

    Returns:
        Array with one row per row of *df* and one column per matching
        column, in the order the columns appear in *df*. If no column
        matches, the array has zero columns.

    Raises:
        ValueError: If a selected column holds values that cannot be cast
            to float (raised by ``DataFrame.astype``).
    """
    # Non-string labels cannot carry the prefix; skip them instead of
    # failing on ``.startswith``.
    feature_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]
    return df[feature_cols].astype(float).to_numpy()
# CSV read by this script; it must provide the 'comment_type' and
# 'AuthorWMFAffil' columns plus the feature columns that format_df_data selects.
BIBER_LABELS_CSV = "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv"


def main():
    """Project the task-description feature vectors to 2-D and plot them.

    Reads ``BIBER_LABELS_CSV``, keeps the rows whose ``comment_type`` is
    ``'task_description'``, fits a two-component PCA on the feature matrix
    returned by ``format_df_data``, and writes two scatter plots coloured by
    ``AuthorWMFAffil`` to the current working directory:

    * ``090225_biber_pca_plot.png`` -- matplotlib scatter with a colorbar
    * ``biber_pca_affil.png`` -- seaborn scatter with a categorical legend

    Finally the open figures are displayed with ``plt.show()``.
    """
    biber_vec_df = pd.read_csv(BIBER_LABELS_CSV, low_memory=False)
    biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
    biber_vecs = format_df_data(biber_vec_df)

    # handoff to PCA model
    pca = PCA(2)
    biber_vecs_pca = pca.fit_transform(biber_vecs)

    # Both plots colour by author affiliation. Stringify the labels once so
    # the two plots agree on the categories and the encoder only ever sees a
    # single value type (mixed-type / missing labels may be rejected by some
    # scikit-learn versions -- the seaborn plot already used str labels).
    affil_labels = biber_vec_df['AuthorWMFAffil'].astype(str)

    # Plot 1: integer-encoded affiliation mapped onto a continuous colormap.
    le = LabelEncoder()
    colors = le.fit_transform(affil_labels)
    plt.figure()
    plt.scatter(biber_vecs_pca[:, 0], biber_vecs_pca[:, 1],
                c=colors, edgecolor='none', alpha=0.5, cmap="viridis")
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.colorbar()
    plt.savefig("090225_biber_pca_plot.png", dpi=300)

    # Plot 2: same projection with a categorical palette and a legend.
    plot_df = pd.DataFrame({
        "PC1": biber_vecs_pca[:, 0],
        "PC2": biber_vecs_pca[:, 1],
        "AuthorWMFAffil": affil_labels,
    })
    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        data=plot_df, x="PC1", y="PC2", hue="AuthorWMFAffil",
        palette="tab10", s=40, alpha=0.7, edgecolor=None
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    # Anchor the legend outside the axes so it does not cover any points.
    plt.legend(title='AuthorWMFAffil', bbox_to_anchor=(1.05, 1), loc=2)
    plt.tight_layout()
    plt.savefig("biber_pca_affil.png", dpi=300)
    plt.show()


if __name__ == "__main__":
    main()