126 lines
5.1 KiB
Python
126 lines
5.1 KiB
Python
from sklearn.decomposition import PCA, KernelPCA
|
|
from sklearn.preprocessing import LabelEncoder
|
|
import pandas as pd
|
|
#import torch
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
import pickle
|
|
|
|
# Labels of the 96 features that Neurobiber can predict (16 rows of 6).
# NOTE: the order is load-bearing. The script's loading report looks names up
# here by feature position (BIBER_FEATURES[idx]), so entry i must describe
# feature column i of the matrix handed to PCA.
# Assumes this order matches the order of the `normalized_*` columns in the
# input CSV -- TODO confirm against the code that wrote that file.
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
|
|
|
|
|
|
|
|
def format_df_data(df):
    """Return the ``normalized_*`` columns of *df* as a 2-D float array.

    The feature values are stored one per column, each column name carrying
    the ``normalized_`` prefix, rather than as a single vector-valued column.
    This helper gathers those columns, in the order they appear in *df*, and
    casts them to float.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        Array with one row per row of *df* and one column per
        ``normalized_*`` column.
    """
    feature_columns = list(
        filter(lambda name: name.startswith('normalized_'), df.columns)
    )
    feature_frame = df[feature_columns]
    return feature_frame.astype(float).values
|
|
|
|
if __name__ == "__main__":
    # Script entry point. Steps, in order:
    #   1. load the labelled CSV and keep only task descriptions,
    #   2. fit a PCA on the normalized feature columns and pickle the model,
    #   3. print per-component variances and the strongest loadings,
    #   4. write the per-row projections plus metadata to CSV,
    #   5. save (and show) a PC1/PC2 scatter grid faceted by source and phase.

    # Number of principal components to keep. This was presumably picked from
    # an exploratory run (previously kept here as disabled code) that fit an
    # unrestricted PCA and took the smallest number of components whose
    # cumulative explained variance reached 90%:
    #     cumulative = np.cumsum(PCA().fit(biber_vecs).explained_variance_ratio_)
    #     n_components = np.argmax(cumulative >= 0.90) + 1
    # TODO confirm the value is still appropriate if the input data changes.
    n_components = 18

    # Metadata column used to colour the scatter points and title the legend.
    selected_axis = "AuthorWMFAffil"

    # ---- 1. load and filter -------------------------------------------------
    biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", low_memory=False)
    biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
    # One specific author is excluded; the reason is not recorded in this file.
    biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]

    biber_vecs = format_df_data(biber_vec_df)

    # ---- 2. fit and persist the PCA model -----------------------------------
    pca = PCA(n_components=n_components)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    with open('092325_pca.pkl', 'wb') as f:
        pickle.dump(pca, f)

    # ---- 3. diagnostics -----------------------------------------------------
    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)

    # Loadings are positional, so the label list must have exactly one entry
    # per fitted feature. If BIBER_FEATURES does not line up with the data,
    # label with the real column names instead of silently mislabelling (or
    # raising IndexError part-way through the report).
    n_features = pca.components_.shape[1]
    if n_features == len(BIBER_FEATURES):
        feature_names = BIBER_FEATURES
    else:
        feature_names = [col for col in biber_vec_df.columns if col.startswith('normalized_')]
        print(
            f"Warning: PCA was fit on {n_features} features but BIBER_FEATURES "
            f"has {len(BIBER_FEATURES)} entries; labelling loadings with column names."
        )

    for i, component in enumerate(pca.components_):
        print(f"PC{i+1}:")
        # Rank features by absolute loading, strongest first, and report 10.
        strongest = np.argsort(np.abs(component))[::-1][:10]
        for idx in strongest:
            print(f" {feature_names[idx]}: {component[idx]:.3f}")

    # ---- 4. projections + metadata table ------------------------------------
    # The PC columns are plain arrays while the metadata columns are Series
    # that still carry the filtered frame's index. pandas takes the index from
    # the Series and places the arrays positionally, which is correct here
    # because both come from the same rows in the same order.
    pc_dict = {f"PC{i+1}": biber_vecs_pca[:, i] for i in range(biber_vecs_pca.shape[1])}

    # (output column, source column) pairs that are stringified.
    for out_name, src_name in (
        (selected_axis, selected_axis),
        ("source", "source"),
        ("phase", "phase"),
        ("text", "comment_text"),
    ):
        pc_dict[out_name] = biber_vec_df[src_name].astype(str)

    # Columns carried over unchanged.
    for name in ('id', 'week_index', 'priority', 'closed_relevance'):
        pc_dict[name] = biber_vec_df[name]

    plot_df = pd.DataFrame(pc_dict)
    plot_df.to_csv("092325_description_PCA_df.csv", index=False)

    # Rows at both extremes of the first two components, for eyeballing.
    for pc in ("PC1", "PC2"):
        print(f"Top 10 {pc} values:")
        print(plot_df.nlargest(10, pc))
        print(f"\nBottom 10 {pc} values:")
        print(plot_df.nsmallest(10, pc))

    # ---- 5. faceted scatter plot --------------------------------------------
    g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
    g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
    g.add_legend(title=selected_axis)
    g.set_axis_labels("PC1", "PC2")

    # Order matters: run the grid-aware tight_layout first (it keeps the space
    # add_legend reserved for the outside legend), then carve out the title
    # band. Calling the figure's own tight_layout afterwards would recompute
    # the margins and discard both reservations.
    g.tight_layout()
    g.fig.subplots_adjust(top=0.9)
    g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")

    g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300)
    plt.show()
|