diff --git a/p2/quest/cleaned_biberplus-categorization.log b/p2/quest/cleaned_biberplus-categorization.log index d9d1ada..205d63c 100644 --- a/p2/quest/cleaned_biberplus-categorization.log +++ b/p2/quest/cleaned_biberplus-categorization.log @@ -1,36 +1,3 @@ -starting the job at: Fri Jul 25 20:48:01 CDT 2025 +starting the job at: Tue Sep 23 16:37:07 CDT 2025 setting up the environment running the biberplus labeling script -26024 -26024 - id ... http_flag -0 56791 ... NaN -1 269631 ... NaN -2 269628 ... NaN -3 269622 ... NaN -4 56737 ... NaN -... ... ... ... -26019 403186 ... True -26020 78646 ... True -26021 429163 ... True -26022 429137 ... True -26023 418783 ... True - -[26024 rows x 22 columns] - id ... cleaned_messages -0 56791 ... pawn character editing\n\nseen on master branc... -1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up... -2 269628 ... *** Bug 54785 has been marked as a duplicate o... -3 269622 ... Change 86685 had a related patch set uploaded ... -4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th... -... ... ... ... -26019 403186 ... Could you attach a screenshot please? Drag & d... -26020 78646 ... Hi,\n\nWe have a wiki which has a part which c... -26021 429163 ... Sorry for not reply-ing. I did a test and coul... -26022 429137 ... SCREEN_NAME: Please answer. -26023 418783 ... I cannot replicate this. What's the name of th... - -[26024 rows x 122 columns] -biberplus labeling pau -job finished, cleaning up -job pau at: Fri Jul 25 20:55:26 CDT 2025 diff --git a/p2/quest/neurobiber-pca.log b/p2/quest/neurobiber-pca.log index 428e7ae..eaf385f 100644 --- a/p2/quest/neurobiber-pca.log +++ b/p2/quest/neurobiber-pca.log @@ -1,9 +1,207 @@ -starting the job at: Thu Sep 4 15:41:55 CDT 2025 +starting the job at: Tue Sep 23 16:37:06 CDT 2025 setting up the environment running the neurobiber labeling script Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521 7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747 4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119] +PC1: + BIN_NNP: 0.760 + BIN_CAP: 0.524 + BIN_DET: -0.166 + BIN_PREP: -0.157 + BIN_PIN: -0.157 + BIN_ART: -0.126 + BIN_NN: -0.119 + BIN_RB: -0.076 + BIN_INF: -0.070 + BIN_VPRT: -0.069 +PC2: + BIN_PREP: 0.473 + BIN_PIN: 0.473 + BIN_NNP: 0.426 + BIN_DET: 0.323 + BIN_ART: 0.240 + BIN_NOMZ: -0.233 + BIN_VPRT: 0.142 + BIN_RB: 0.132 + BIN_SBJP: 0.119 + BIN_PRP: 0.119 +PC3: + BIN_CAP: 0.727 + BIN_NN: 0.546 + BIN_NNP: -0.363 + BIN_PREP: 0.102 + BIN_PIN: 0.102 + BIN_DET: 0.058 + BIN_ART: 0.056 + BIN_SBJP: -0.048 + BIN_PRP: -0.048 + BIN_PRIV: 0.036 +PC4: + BIN_NN: 0.659 + BIN_CAP: -0.391 + BIN_PRP: -0.260 + BIN_SBJP: -0.260 + BIN_NNP: 0.247 + BIN_RB: -0.236 + BIN_ART: 0.141 + BIN_FPP1: -0.130 + BIN_INF: -0.128 + BIN_PREP: -0.127 +PC5: + BIN_DET: 0.485 + BIN_ART: 0.422 + BIN_PIN: -0.421 + BIN_PREP: -0.421 + BIN_RB: 0.245 + BIN_VPRT: 0.196 + BIN_INDA: 0.142 + BIN_NOMZ: -0.123 + BIN_PRP: 0.108 + BIN_SBJP: 0.108 +PC6: + BIN_NOMZ: 0.368 + BIN_NN: -0.345 + BIN_DET: 0.344 + BIN_RB: -0.339 + BIN_ART: 0.326 + BIN_JJ: 0.324 + BIN_PRP: -0.262 + BIN_SBJP: -0.262 + BIN_FPP1: -0.144 + BIN_INDA: 0.128 +PC7: + BIN_JJ: 0.448 + BIN_X: -0.439 + BIN_QUOT: -0.375 + BIN_NOMZ: 0.312 + BIN_NN: 0.271 + BIN_RB: 0.231 + BIN_NUM: -0.179 + BIN_VPRT: 0.179 + BIN_INF: -0.169 + BIN_NNP: 0.164 +PC8: + BIN_RB: 0.623 + BIN_PRP: -0.415 + BIN_SBJP: -0.415 + BIN_FPP1: -0.240 + BIN_INF: 0.233 + BIN_JJ: 0.150 + BIN_AUXB: 0.147 + BIN_NOMZ: -0.143 + BIN_XX0: 0.110 + BIN_SPAU: 0.103 +PC9: + BIN_INF: 0.712 + BIN_VPRT: -0.427 + BIN_TO: 0.206 + BIN_X: -0.190 + BIN_AUXB: -0.179 + BIN_NUM: -0.173 + BIN_QUOT: -0.161 + BIN_NOMZ: 0.159 + BIN_CONJ: -0.122 + BIN_PRIV: 0.102 +PC10: + BIN_QUOT: 0.726 + BIN_JJ: 0.496 + BIN_CONT: 0.327 + BIN_X: -0.170 + BIN_NUM: -0.149 + BIN_INF: 0.134 + BIN_PASS: -0.080 + BIN_NOMZ: -0.074 + BIN_NN: 0.068 + BIN_AUXB: -0.060 +PC11: + BIN_X: 0.620 + BIN_JJ: 0.575 + BIN_NOMZ: -0.292 + BIN_QUOT: -0.288 + BIN_INF: 0.131 + BIN_PRP: 0.125 + BIN_SBJP: 0.125 + BIN_CONT: -0.123 + BIN_RB: -0.092 + BIN_FPP1: 0.085 +PC12: + BIN_VPRT: 0.529 + BIN_AUXB: 0.431 + BIN_RB: -0.404 + BIN_INF: 0.364 + BIN_TO: 0.187 + BIN_ART: -0.186 + BIN_PASS: 0.183 + BIN_VBD: -0.158 + BIN_BEMA: 0.128 + BIN_DEMP: 0.110 +PC13: + BIN_NUM: 0.554 + BIN_X: -0.544 + BIN_NOMZ: -0.509 + BIN_JJ: 0.160 + BIN_RB: -0.156 + BIN_QUOT: -0.124 + BIN_CONT: -0.109 + BIN_NN: -0.103 + BIN_VPRT: -0.081 + BIN_NNP: -0.073 +PC14: + BIN_NUM: 0.595 + BIN_NOMZ: 0.366 + BIN_VPRT: 0.348 + BIN_AUXB: -0.332 + BIN_VBD: -0.262 + BIN_PASS: -0.188 + BIN_CONT: 0.161 + BIN_INF: 0.157 + BIN_PGAS: -0.118 + BIN_CONJ: -0.118 +PC15: + BIN_AUXB: 0.484 + BIN_NUM: 0.450 + BIN_NOMZ: 0.315 + BIN_VPRT: -0.307 + BIN_VBD: 0.262 + BIN_PASS: 0.207 + BIN_BEMA: 0.194 + BIN_CONJ: 0.170 + BIN_PRIV: -0.162 + BIN_QUOT: 0.159 +PC16: + BIN_CONJ: 0.673 + BIN_PGAS: -0.355 + BIN_CCONJ: 0.324 + BIN_SCONJ: -0.247 + BIN_TO: -0.197 + BIN_VBD: -0.185 + BIN_WH: -0.164 + BIN_FPP1: -0.128 + BIN_PRIV: 0.113 + BIN_DEMP: -0.096 +PC17: + BIN_CCONJ: 0.471 + BIN_CONT: 0.462 + BIN_INDA: -0.260 + BIN_XX0: 0.221 + BIN_SCONJ: -0.216 + BIN_CONJ: -0.210 + BIN_SPAU: 0.199 + BIN_DET: 0.197 + BIN_FPP1: 0.196 + BIN_QUOT: -0.185 +PC18: + BIN_PGAS: 0.578 + BIN_CCONJ: 0.564 + BIN_CONT: -0.268 + BIN_PRIV: -0.235 + BIN_ANDC: 0.144 + BIN_PASS: -0.143 + BIN_QUOT: 0.138 + BIN_SPAU: -0.125 + BIN_VBD: -0.115 + BIN_NOMZ: 0.114 Top 10 PC1 values: PC1 PC2 ... priority closed_relevance 19873 125.128650 24.461032 ... Medium False @@ -63,4 +261,4 @@ Bottom 10 PC2 values: [10 rows x 26 columns] job finished, cleaning up -job pau at: Thu Sep 4 15:42:13 CDT 2025 +job pau at: Tue Sep 23 16:37:56 CDT 2025 diff --git a/p2/quest/python_scripts/biberplus_labeling.py b/p2/quest/python_scripts/biberplus_labeling.py index 347f544..d65a006 100644 --- a/p2/quest/python_scripts/biberplus_labeling.py +++ b/p2/quest/python_scripts/biberplus_labeling.py @@ -73,20 +73,28 @@ def biberplus_labeler(text): frequencies_df = frequencies_df.reset_index(drop=True) return frequencies_df +def make_text_for_analysis(row): + if row['comment_type'] == "task_description": + return f"{row['task_title']}./n/n{row['comment_text']}" + else: + return row['comment_text'] + + if __name__ == "__main__": #loading in the discussion data from the universal CSV first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") #formatting for the neurobiber model #docs = first_discussion_df["comment_text"].astype(str).tolist() - task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"] - docs = task_description_df['task_title'].astype(str).tolist() + first_discussion_df['text_for_analysis'] = first_discussion_df.apply(make_text_for_analysis, axis=1) + #task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"] + docs = first_discussion_df['text_for_analysis'].astype(str).tolist() #load model and run #model, tokenizer = load_model_and_tokenizer() preds_df = biberplus_labeler(docs) #new columns in the df for the predicted neurobiber items #preds_cols = [f"neurobiber_{i+1}" for i in range(96)] #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index) - final_discussion_df = pd.concat([task_description_df, preds_df], axis=1) + final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1) #print(type(preds)) #assigning the preditions as a new column ''' @@ -109,6 +117,6 @@ if __name__ == "__main__": assert len(task_description_df) == len(final_discussion_df) final_discussion_df = final_discussion_df.drop(columns=["message"]) # if passing the prior asserts, let's write to a csv - final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False) + final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", index=False) print('biberplus labeling pau') diff --git a/p2/quest/python_scripts/neurobiber_PCA.py b/p2/quest/python_scripts/neurobiber_PCA.py index 1a08518..42eb6b1 100644 --- a/p2/quest/python_scripts/neurobiber_PCA.py +++ b/p2/quest/python_scripts/neurobiber_PCA.py @@ -6,6 +6,28 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns +import pickle + +# List of the 96 features that Neurobiber can predict +BIBER_FEATURES = [ + "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ", + "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT", + "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB", + "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP", + "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD", + "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2", + "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3", + "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB", + "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH", + "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP", + "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB", + "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ", + "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF", + "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP", + "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH", + "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X" +] + def format_df_data(df): @@ -34,11 +56,18 @@ if __name__ == "__main__": ''' pca = PCA(n_components=18) biber_vecs_pca = pca.fit_transform(biber_vecs) + with open('092325_pca.pkl', 'wb') as f: + pickle.dump(pca, f) selected_axis = "AuthorWMFAffil" component_variances = np.var(biber_vecs_pca, axis=0) print("Variance of each PCA component:", component_variances) + for i, component in enumerate(pca.components_): + print(f"PC{i+1}:") + indices = np.argsort(np.abs(component))[::-1] + for idx in indices[:10]: # Top 10 + print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}") #first looking at comment_type le = LabelEncoder() @@ -55,7 +84,7 @@ if __name__ == "__main__": pc_dict['closed_relevance'] = biber_vec_df['closed_relevance'] plot_df = pd.DataFrame(pc_dict) - plot_df.to_csv("090425_description_PCA_df.csv", index=False) + plot_df.to_csv("092325_description_PCA_df.csv", index=False) print("Top 10 PC1 values:") print(plot_df.nlargest(10, "PC1")) @@ -92,5 +121,5 @@ if __name__ == "__main__": plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2) ''' g.fig.tight_layout() - g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300) + g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300) plt.show()