68 lines
2.4 KiB
Python
68 lines
2.4 KiB
Python
from sklearn.decomposition import PCA, KernelPCA
|
|
from sklearn.preprocessing import LabelEncoder
|
|
import pandas as pd
|
|
#import torch
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
|
|
|
|
def format_df_data(df: pd.DataFrame) -> np.ndarray:
    """Return the ``normalized_*`` columns of *df* as a 2-D float array.

    The feature values are stored one per column, each column name carrying
    the ``normalized_`` prefix; every other column is ignored.

    Args:
        df: Table containing the ``normalized_``-prefixed feature columns.

    Returns:
        Array with one row per row of *df* and one column per matching
        column, in the order the columns appear in *df*. When no column
        matches, the array has zero columns.

    Raises:
        ValueError: If a selected column holds values that cannot be
            converted to ``float``.
    """
    # Selection is purely name-based: only the prefix decides what is a feature.
    normalized_cols = [col for col in df.columns if col.startswith('normalized_')]
    return df[normalized_cols].astype(float).values
|
|
|
|
if __name__ == "__main__":
    # Load the labelled feature table and keep only the rows whose
    # comment_type is 'task_description'.
    biber_vec_df = pd.read_csv(
        "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv",
        low_memory=False,
    )
    biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
    biber_vecs = format_df_data(biber_vec_df)

    # Metadata column used to colour both plots, title the legend and prefix
    # the output filename. Every later reference goes through this variable.
    selected_axis = "source"

    # Trial run kept for reference: fit an unrestricted PCA and report how
    # many components are needed to reach 90% cumulative explained variance.
    # NOTE(review): n_components=18 below presumably came from this trial --
    # confirm before changing the input data.
    #
    # pca_trial = PCA()
    # biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)
    # explained_variance = pca_trial.explained_variance_ratio_
    # cumulative_variance = np.cumsum(explained_variance)
    # n_components = np.argmax(cumulative_variance >= 0.90) + 1
    # print(f"Number of PCs explaining 90% variance: {n_components}")

    # handoff to PCA model
    pca = PCA(n_components=18)
    biber_vecs_pca = pca.fit_transform(biber_vecs)

    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)

    # Quick-look figure: first two components coloured by the integer codes
    # LabelEncoder assigns to the selected_axis labels, so the colorbar shows
    # those codes rather than the label names.
    le = LabelEncoder()
    colors = le.fit_transform(biber_vec_df[selected_axis])
    plt.scatter(
        biber_vecs_pca[:, 0], biber_vecs_pca[:, 1],
        c=colors, edgecolor='none', alpha=0.5, cmap="viridis",
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    plt.colorbar()
    # Saving this figure is currently disabled; it is only displayed by
    # plt.show() at the end.
    # plt.savefig("090225_biber_pca_plot.png", dpi=300)

    # Labelled figure: same two components with a categorical legend.
    # The label column is passed as a plain array so it is paired with the
    # PCA rows by position; biber_vec_df keeps the index of the unfiltered
    # CSV, which does not line up with the rows of biber_vecs_pca.
    plot_df = pd.DataFrame({
        "PC1": biber_vecs_pca[:, 0],
        "PC2": biber_vecs_pca[:, 1],
        selected_axis: biber_vec_df[selected_axis].astype(str).to_numpy(),
    })
    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        data=plot_df, x="PC1", y="PC2",
        # Was hard-coded to "source"; must follow selected_axis so the hue
        # column always matches the column actually stored in plot_df.
        hue=selected_axis,
        palette="tab10", s=40, alpha=0.7, edgecolor=None,
    )
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    # loc=2 is 'upper left'; anchoring at x=1.05 places the legend just
    # outside the right edge of the axes.
    plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
    plt.tight_layout()
    # NOTE(review): the filename says "kernelpca" although a plain PCA is fit
    # above. Left unchanged so existing output paths keep working -- confirm
    # whether it should be renamed.
    plt.savefig(f"{selected_axis}_090425_biber_kernelpca_affil.png", dpi=300)
    plt.show()
|