running PCA across both description and reply comment types
This commit is contained in:
parent
c40e87ff80
commit
bf4bc88083
341
p2/quest/102025_total_neurobiber-pca.log
Normal file
341
p2/quest/102025_total_neurobiber-pca.log
Normal file
@@ -0,0 +1,341 @@
|
||||
starting the job at: Mon Oct 20 11:28:17 CDT 2025
|
||||
setting up the environment
|
||||
running the neurobiber labeling script
|
||||
0 [VisualEditor: Editing at top of page shows pa...
|
||||
1 [Change 86685 merged by jenkins-bot:\nFollow-u...
|
||||
2 [*** Bug 54785 has been marked as a duplicate ...
|
||||
3 [Change 86685 had a related patch set uploaded...
|
||||
4 [Don't release any new VE updates for three mo...
|
||||
...
|
||||
25023 [Could you attach a screenshot please?, Drag &...
|
||||
25024 [2 login-links on page but different behaviour...
|
||||
25025 [Sorry for not reply-ing., I did a test and co...
|
||||
25026 [SCREEN_NAME: Please answer.]
|
||||
25027 [I cannot replicate this., What's the name of ...
|
||||
Name: olmo_cleaned_sentences, Length: 25028, dtype: object
|
||||
[[13. ]
|
||||
[18. ]
|
||||
[ 6.5]
|
||||
...
|
||||
[ 5.5]
|
||||
[ 3. ]
|
||||
[ 6. ]]
|
||||
Number of PCs explaining 90% variance: 23
|
||||
Variance of each PCA component: [246.61008055 133.36698159 74.94326075 61.33534517 54.80370961
|
||||
35.82933802 29.27972058 23.85842265 19.90839023 17.25654085
|
||||
14.9325249 14.69222654 12.40988952 10.49363109 9.72451692
|
||||
8.38950194 8.11897062 7.49040269 7.26770791 6.45798815
|
||||
6.22392084 5.87412538 5.70894249]
|
||||
PC1:
|
||||
normalized_CAP: 0.666
|
||||
normalized_NNP: 0.595
|
||||
median_sentence_length: -0.315
|
||||
normalized_DET: -0.141
|
||||
normalized_PREP: -0.119
|
||||
normalized_PIN: -0.119
|
||||
normalized_ART: -0.090
|
||||
normalized_VPRT: -0.079
|
||||
normalized_RB: -0.075
|
||||
normalized_PRP: -0.069
|
||||
PC2:
|
||||
median_sentence_length: 0.930
|
||||
normalized_NNP: 0.313
|
||||
normalized_CAP: 0.093
|
||||
normalized_DET: -0.065
|
||||
normalized_VPRT: -0.065
|
||||
normalized_RB: -0.065
|
||||
normalized_AUXB: -0.050
|
||||
normalized_PRP: -0.040
|
||||
normalized_SBJP: -0.040
|
||||
normalized_X: 0.034
|
||||
PC3:
|
||||
normalized_NN: 0.759
|
||||
normalized_NNP: -0.288
|
||||
normalized_RB: -0.261
|
||||
normalized_PRP: -0.228
|
||||
normalized_SBJP: -0.228
|
||||
normalized_CAP: 0.210
|
||||
normalized_VPRT: -0.164
|
||||
normalized_FPP1: -0.115
|
||||
normalized_NUM: 0.102
|
||||
normalized_INF: -0.096
|
||||
PC4:
|
||||
normalized_CAP: 0.526
|
||||
normalized_PIN: 0.481
|
||||
normalized_PREP: 0.481
|
||||
normalized_NNP: -0.212
|
||||
normalized_PRP: 0.185
|
||||
normalized_SBJP: 0.185
|
||||
normalized_X: -0.141
|
||||
normalized_RB: 0.133
|
||||
normalized_CONJ: 0.129
|
||||
normalized_INF: 0.125
|
||||
PC5:
|
||||
normalized_NNP: 0.478
|
||||
normalized_PREP: 0.454
|
||||
normalized_PIN: 0.454
|
||||
normalized_CAP: -0.416
|
||||
normalized_RB: -0.267
|
||||
normalized_SBJP: -0.139
|
||||
normalized_PRP: -0.139
|
||||
median_sentence_length: -0.137
|
||||
normalized_VPRT: -0.108
|
||||
normalized_CONJ: 0.107
|
||||
PC6:
|
||||
normalized_DET: 0.615
|
||||
normalized_ART: 0.387
|
||||
normalized_NN: 0.289
|
||||
normalized_NNP: 0.273
|
||||
normalized_VPRT: 0.260
|
||||
normalized_X: -0.253
|
||||
normalized_AUXB: 0.209
|
||||
normalized_NUM: -0.172
|
||||
normalized_INDA: 0.159
|
||||
normalized_INF: -0.156
|
||||
PC7:
|
||||
normalized_NN: 0.477
|
||||
normalized_PRP: 0.451
|
||||
normalized_SBJP: 0.451
|
||||
normalized_NNP: 0.249
|
||||
normalized_FPP1: 0.231
|
||||
normalized_DET: -0.223
|
||||
normalized_CAP: -0.176
|
||||
normalized_AUXB: -0.169
|
||||
normalized_PASS: -0.134
|
||||
normalized_PIT: 0.125
|
||||
PC8:
|
||||
normalized_RB: 0.781
|
||||
normalized_NN: 0.240
|
||||
normalized_PRP: -0.207
|
||||
normalized_SBJP: -0.206
|
||||
normalized_DET: -0.185
|
||||
normalized_JJ: -0.171
|
||||
normalized_ART: -0.147
|
||||
normalized_NNP: 0.143
|
||||
normalized_TIME: 0.140
|
||||
normalized_X: -0.137
|
||||
PC9:
|
||||
normalized_JJ: 0.676
|
||||
normalized_INF: 0.326
|
||||
normalized_VPRT: -0.300
|
||||
normalized_NUM: -0.258
|
||||
normalized_PASS: -0.218
|
||||
normalized_AUXB: -0.216
|
||||
normalized_ART: 0.200
|
||||
sentence_count: 0.153
|
||||
normalized_RB: 0.136
|
||||
normalized_CONJ: -0.134
|
||||
PC10:
|
||||
normalized_INF: 0.647
|
||||
normalized_JJ: -0.513
|
||||
normalized_VPRT: -0.334
|
||||
normalized_DET: 0.260
|
||||
normalized_ART: 0.158
|
||||
normalized_TO: 0.127
|
||||
normalized_PRIV: 0.101
|
||||
normalized_NUM: 0.100
|
||||
normalized_AUXB: -0.094
|
||||
normalized_INDA: 0.077
|
||||
PC11:
|
||||
sentence_count: 0.554
|
||||
normalized_INF: -0.400
|
||||
normalized_AUXB: -0.331
|
||||
normalized_VPRT: -0.304
|
||||
normalized_JJ: -0.301
|
||||
normalized_ART: 0.208
|
||||
normalized_RB: 0.169
|
||||
normalized_PASS: -0.153
|
||||
normalized_BEMA: -0.129
|
||||
normalized_TO: -0.127
|
||||
PC12:
|
||||
sentence_count: 0.673
|
||||
normalized_X: -0.380
|
||||
normalized_RB: -0.235
|
||||
normalized_VBD: -0.235
|
||||
normalized_INF: 0.214
|
||||
normalized_VPRT: 0.188
|
||||
normalized_AUXB: 0.182
|
||||
normalized_DET: -0.153
|
||||
normalized_NUM: -0.150
|
||||
normalized_ART: -0.148
|
||||
PC13:
|
||||
normalized_X: 0.773
|
||||
normalized_VPRT: 0.284
|
||||
normalized_VBD: -0.259
|
||||
normalized_PUBV: -0.253
|
||||
sentence_count: 0.210
|
||||
normalized_NUM: -0.180
|
||||
normalized_CONJ: -0.144
|
||||
normalized_NOMZ: -0.116
|
||||
normalized_INF: 0.103
|
||||
normalized_UH: -0.096
|
||||
PC14:
|
||||
normalized_NUM: 0.789
|
||||
sentence_count: 0.222
|
||||
normalized_QUOT: -0.181
|
||||
normalized_JJ: 0.174
|
||||
normalized_VBD: -0.171
|
||||
normalized_VPRT: 0.165
|
||||
normalized_UH: -0.160
|
||||
normalized_PGAS: -0.156
|
||||
normalized_RB: 0.153
|
||||
normalized_INDA: 0.126
|
||||
PC15:
|
||||
normalized_VBD: 0.495
|
||||
normalized_AUXB: 0.374
|
||||
normalized_QUOT: -0.369
|
||||
normalized_CONT: -0.288
|
||||
sentence_count: 0.269
|
||||
normalized_PASS: 0.261
|
||||
normalized_VPRT: -0.244
|
||||
normalized_X: 0.174
|
||||
normalized_NOMZ: -0.154
|
||||
normalized_PUBV: 0.142
|
||||
PC16:
|
||||
normalized_QUOT: 0.594
|
||||
normalized_CONT: 0.441
|
||||
normalized_VBD: 0.400
|
||||
normalized_UH: -0.278
|
||||
normalized_NUM: 0.167
|
||||
normalized_PUBV: 0.164
|
||||
normalized_CONJ: -0.164
|
||||
normalized_NOMZ: -0.156
|
||||
normalized_STPR: 0.127
|
||||
normalized_AUXB: 0.121
|
||||
PC17:
|
||||
normalized_PUBV: 0.504
|
||||
normalized_CONJ: -0.473
|
||||
normalized_PGAS: 0.303
|
||||
normalized_VPRT: 0.255
|
||||
normalized_NOMZ: 0.245
|
||||
normalized_PASS: -0.219
|
||||
normalized_QUOT: -0.202
|
||||
normalized_CONT: -0.160
|
||||
normalized_AUXB: -0.128
|
||||
normalized_UH: -0.123
|
||||
PC18:
|
||||
normalized_CONJ: 0.612
|
||||
normalized_PUBV: 0.504
|
||||
normalized_UH: -0.308
|
||||
normalized_NUM: -0.210
|
||||
normalized_NOMZ: 0.166
|
||||
normalized_PGAS: -0.164
|
||||
normalized_ART: 0.156
|
||||
normalized_X: 0.155
|
||||
normalized_VPRT: 0.152
|
||||
normalized_DEMP: -0.116
|
||||
PC19:
|
||||
normalized_UH: 0.727
|
||||
normalized_PGAS: -0.434
|
||||
normalized_PUBV: 0.252
|
||||
normalized_NOMZ: -0.184
|
||||
normalized_VPRT: 0.182
|
||||
normalized_CCONJ: -0.171
|
||||
normalized_VBD: 0.142
|
||||
normalized_TIME: -0.092
|
||||
normalized_ART: 0.091
|
||||
normalized_INDA: 0.083
|
||||
PC20:
|
||||
normalized_ART: 0.432
|
||||
normalized_DET: -0.341
|
||||
normalized_DEMO: -0.286
|
||||
normalized_INDA: 0.280
|
||||
normalized_DEMP: -0.273
|
||||
normalized_CONJ: -0.271
|
||||
normalized_NOMZ: 0.216
|
||||
normalized_PIT: 0.211
|
||||
normalized_FPP1: -0.199
|
||||
normalized_AUXB: 0.199
|
||||
PC21:
|
||||
normalized_PGAS: 0.725
|
||||
normalized_UH: 0.307
|
||||
normalized_AUXB: 0.253
|
||||
normalized_NOMZ: -0.222
|
||||
normalized_CONJ: 0.211
|
||||
normalized_ART: 0.154
|
||||
normalized_BEMA: 0.143
|
||||
normalized_NUM: 0.110
|
||||
normalized_CCONJ: -0.106
|
||||
normalized_QUES: -0.106
|
||||
PC22:
|
||||
normalized_CCONJ: 0.625
|
||||
normalized_NOMZ: -0.330
|
||||
normalized_QUES: -0.312
|
||||
normalized_VPRT: 0.243
|
||||
normalized_AUXB: -0.234
|
||||
normalized_PRIV: 0.176
|
||||
normalized_BEMA: -0.175
|
||||
normalized_TIME: 0.174
|
||||
normalized_SCONJ: -0.162
|
||||
normalized_WH: -0.132
|
||||
PC23:
|
||||
normalized_PRIV: 0.614
|
||||
normalized_NOMZ: 0.395
|
||||
normalized_VBD: 0.269
|
||||
normalized_CCONJ: -0.257
|
||||
normalized_QUES: -0.246
|
||||
normalized_PUBV: -0.241
|
||||
normalized_FPP1: 0.217
|
||||
normalized_PIT: -0.154
|
||||
normalized_TO: -0.119
|
||||
normalized_CONJ: 0.117
|
||||
Top 10 PC1 values:
|
||||
PC1 PC2 ... AuthorPHID date_created
|
||||
23531 122.614493 26.508184 ... PHID-USER-arjqb24x4oae7awzpfp6 1424754141
|
||||
707 122.597764 26.498206 ... PHID-USER-pun3sjvg3cemjzbgyo2t 1363132183
|
||||
744 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353551242
|
||||
749 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353384355
|
||||
2243 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1356175107
|
||||
5921 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353366778
|
||||
5933 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353123761
|
||||
5935 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353386649
|
||||
10080 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1366298361
|
||||
10418 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1355363288
|
||||
|
||||
[10 rows x 33 columns]
|
||||
|
||||
Bottom 10 PC1 values:
|
||||
PC1 PC2 ... AuthorPHID date_created
|
||||
16080 -172.850818 487.621918 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
|
||||
24812 -145.955594 438.167891 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
|
||||
24813 -145.770728 438.255988 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
|
||||
13983 -97.131561 274.630845 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
|
||||
16510 -92.274843 294.480228 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
|
||||
161 -74.924301 196.826216 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
|
||||
22005 -66.493946 211.966534 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
|
||||
24815 -66.161789 174.725549 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
|
||||
6163 -65.995800 195.974595 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
|
||||
20858 -58.834964 191.798984 ... PHID-USER-22bsa5u75jz3ci3wnplu 1441031208
|
||||
|
||||
[10 rows x 33 columns]
|
||||
Top 10 PC2 values:
|
||||
PC1 PC2 ... AuthorPHID date_created
|
||||
16080 -172.850818 487.621918 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
|
||||
24813 -145.770728 438.255988 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
|
||||
24812 -145.955594 438.167891 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
|
||||
16510 -92.274843 294.480228 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
|
||||
13983 -97.131561 274.630845 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
|
||||
22005 -66.493946 211.966534 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
|
||||
161 -74.924301 196.826216 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
|
||||
6163 -65.995800 195.974595 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
|
||||
20858 -58.834964 191.798984 ... PHID-USER-22bsa5u75jz3ci3wnplu 1441031208
|
||||
24815 -66.161789 174.725549 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
|
||||
|
||||
[10 rows x 33 columns]
|
||||
|
||||
Bottom 10 PC2 values:
|
||||
PC1 PC2 ... AuthorPHID date_created
|
||||
17259 -11.502462 -20.654068 ... PHID-USER-6vzzsmi22zem6yttr6vp 1321220595
|
||||
24780 -7.649212 -18.480106 ... PHID-USER-lsveyqlsb4acoowxr5yj 1420344576
|
||||
22246 3.193427 -17.554603 ... PHID-USER-2nnm76h4ykalvvref2ye 1461480989
|
||||
20534 -7.911332 -16.886866 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1443534834
|
||||
21315 -10.345515 -16.806925 ... PHID-USER-lzhljhpbm3qfphvqyill 1453975581
|
||||
5078 -14.656633 -16.709304 ... PHID-USER-wil4b5lylrvf3krixlkl 1377104818
|
||||
15192 -14.489305 -16.685012 ... PHID-USER-2nnm76h4ykalvvref2ye 1361930342
|
||||
20939 0.546084 -16.640121 ... PHID-USER-iy46yyjkoqm6q63ztld6 1436977570
|
||||
7055 -0.820605 -16.571093 ... PHID-USER-cfsvvgbtlqnbt2yokfjf 1377020909
|
||||
7471 -0.088132 -16.477148 ... PHID-USER-wkpnidxoctuhawexig5p 1386166246
|
||||
|
||||
[10 rows x 33 columns]
|
||||
job finished, cleaning up
|
||||
job pau at: Mon Oct 20 11:28:52 CDT 2025
|
||||
BIN
p2/quest/102025_total_pca.pkl
Normal file
BIN
p2/quest/102025_total_pca.pkl
Normal file
Binary file not shown.
146014
p2/quest/102025_total_pca_df.csv
Normal file
146014
p2/quest/102025_total_pca_df.csv
Normal file
File diff suppressed because one or more lines are too long
@@ -86,7 +86,7 @@ def format_df_data(df):
|
||||
|
||||
if __name__ == "__main__":
|
||||
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", low_memory=False)
|
||||
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
|
||||
#biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
|
||||
#biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
|
||||
#biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
|
||||
biber_vecs = format_df_data(biber_vec_df)
|
||||
@@ -103,7 +103,7 @@ if __name__ == "__main__":
|
||||
|
||||
pca = PCA(n_components=argmax_components)
|
||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||
with open('101325_subcomment_pca.pkl', 'wb') as f:
|
||||
with open('102025_total_pca.pkl', 'wb') as f:
|
||||
pickle.dump(pca, f)
|
||||
selected_axis = "AuthorWMFAffil"
|
||||
|
||||
@@ -135,7 +135,7 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
plot_df = pd.DataFrame(pc_dict)
|
||||
plot_df.to_csv("101325_subcomment_PCA_df.csv", index=False)
|
||||
plot_df.to_csv("102025_total_pca_df.csv", index=False)
|
||||
|
||||
print("Top 10 PC1 values:")
|
||||
print(plot_df.nlargest(10, "PC1"))
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#SBATCH --mem=64G
|
||||
#SBATCH --cpus-per-task=4
|
||||
#SBATCH --job-name=neurobiber-pca
|
||||
#SBATCH --output=101325_subcomment_neurobiber-pca.log
|
||||
#SBATCH --output=102025_total_neurobiber-pca.log
|
||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user