1
0

removing erroneous PCA df, going to re-run

This commit is contained in:
Matthew Gaughan 2025-10-20 10:31:54 -07:00
parent bf4bc88083
commit 2e8b85d3e9
5 changed files with 28 additions and 146358 deletions

View File

@ -0,0 +1,17 @@
1. Open an SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:57743 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: anx8V7R1X2rfcwUV20H/
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 30181212

View File

@ -1,341 +0,0 @@
starting the job at: Mon Oct 20 11:28:17 CDT 2025
setting up the environment
running the neurobiber labeling script
0 [VisualEditor: Editing at top of page shows pa...
1 [Change 86685 merged by jenkins-bot:\nFollow-u...
2 [*** Bug 54785 has been marked as a duplicate ...
3 [Change 86685 had a related patch set uploaded...
4 [Don't release any new VE updates for three mo...
...
25023 [Could you attach a screenshot please?, Drag &...
25024 [2 login-links on page but different behaviour...
25025 [Sorry for not reply-ing., I did a test and co...
25026 [SCREEN_NAME: Please answer.]
25027 [I cannot replicate this., What's the name of ...
Name: olmo_cleaned_sentences, Length: 25028, dtype: object
[[13. ]
[18. ]
[ 6.5]
...
[ 5.5]
[ 3. ]
[ 6. ]]
Number of PCs explaining 90% variance: 23
Variance of each PCA component: [246.61008055 133.36698159 74.94326075 61.33534517 54.80370961
35.82933802 29.27972058 23.85842265 19.90839023 17.25654085
14.9325249 14.69222654 12.40988952 10.49363109 9.72451692
8.38950194 8.11897062 7.49040269 7.26770791 6.45798815
6.22392084 5.87412538 5.70894249]
PC1:
normalized_CAP: 0.666
normalized_NNP: 0.595
median_sentence_length: -0.315
normalized_DET: -0.141
normalized_PREP: -0.119
normalized_PIN: -0.119
normalized_ART: -0.090
normalized_VPRT: -0.079
normalized_RB: -0.075
normalized_PRP: -0.069
PC2:
median_sentence_length: 0.930
normalized_NNP: 0.313
normalized_CAP: 0.093
normalized_DET: -0.065
normalized_VPRT: -0.065
normalized_RB: -0.065
normalized_AUXB: -0.050
normalized_PRP: -0.040
normalized_SBJP: -0.040
normalized_X: 0.034
PC3:
normalized_NN: 0.759
normalized_NNP: -0.288
normalized_RB: -0.261
normalized_PRP: -0.228
normalized_SBJP: -0.228
normalized_CAP: 0.210
normalized_VPRT: -0.164
normalized_FPP1: -0.115
normalized_NUM: 0.102
normalized_INF: -0.096
PC4:
normalized_CAP: 0.526
normalized_PIN: 0.481
normalized_PREP: 0.481
normalized_NNP: -0.212
normalized_PRP: 0.185
normalized_SBJP: 0.185
normalized_X: -0.141
normalized_RB: 0.133
normalized_CONJ: 0.129
normalized_INF: 0.125
PC5:
normalized_NNP: 0.478
normalized_PREP: 0.454
normalized_PIN: 0.454
normalized_CAP: -0.416
normalized_RB: -0.267
normalized_SBJP: -0.139
normalized_PRP: -0.139
median_sentence_length: -0.137
normalized_VPRT: -0.108
normalized_CONJ: 0.107
PC6:
normalized_DET: 0.615
normalized_ART: 0.387
normalized_NN: 0.289
normalized_NNP: 0.273
normalized_VPRT: 0.260
normalized_X: -0.253
normalized_AUXB: 0.209
normalized_NUM: -0.172
normalized_INDA: 0.159
normalized_INF: -0.156
PC7:
normalized_NN: 0.477
normalized_PRP: 0.451
normalized_SBJP: 0.451
normalized_NNP: 0.249
normalized_FPP1: 0.231
normalized_DET: -0.223
normalized_CAP: -0.176
normalized_AUXB: -0.169
normalized_PASS: -0.134
normalized_PIT: 0.125
PC8:
normalized_RB: 0.781
normalized_NN: 0.240
normalized_PRP: -0.207
normalized_SBJP: -0.206
normalized_DET: -0.185
normalized_JJ: -0.171
normalized_ART: -0.147
normalized_NNP: 0.143
normalized_TIME: 0.140
normalized_X: -0.137
PC9:
normalized_JJ: 0.676
normalized_INF: 0.326
normalized_VPRT: -0.300
normalized_NUM: -0.258
normalized_PASS: -0.218
normalized_AUXB: -0.216
normalized_ART: 0.200
sentence_count: 0.153
normalized_RB: 0.136
normalized_CONJ: -0.134
PC10:
normalized_INF: 0.647
normalized_JJ: -0.513
normalized_VPRT: -0.334
normalized_DET: 0.260
normalized_ART: 0.158
normalized_TO: 0.127
normalized_PRIV: 0.101
normalized_NUM: 0.100
normalized_AUXB: -0.094
normalized_INDA: 0.077
PC11:
sentence_count: 0.554
normalized_INF: -0.400
normalized_AUXB: -0.331
normalized_VPRT: -0.304
normalized_JJ: -0.301
normalized_ART: 0.208
normalized_RB: 0.169
normalized_PASS: -0.153
normalized_BEMA: -0.129
normalized_TO: -0.127
PC12:
sentence_count: 0.673
normalized_X: -0.380
normalized_RB: -0.235
normalized_VBD: -0.235
normalized_INF: 0.214
normalized_VPRT: 0.188
normalized_AUXB: 0.182
normalized_DET: -0.153
normalized_NUM: -0.150
normalized_ART: -0.148
PC13:
normalized_X: 0.773
normalized_VPRT: 0.284
normalized_VBD: -0.259
normalized_PUBV: -0.253
sentence_count: 0.210
normalized_NUM: -0.180
normalized_CONJ: -0.144
normalized_NOMZ: -0.116
normalized_INF: 0.103
normalized_UH: -0.096
PC14:
normalized_NUM: 0.789
sentence_count: 0.222
normalized_QUOT: -0.181
normalized_JJ: 0.174
normalized_VBD: -0.171
normalized_VPRT: 0.165
normalized_UH: -0.160
normalized_PGAS: -0.156
normalized_RB: 0.153
normalized_INDA: 0.126
PC15:
normalized_VBD: 0.495
normalized_AUXB: 0.374
normalized_QUOT: -0.369
normalized_CONT: -0.288
sentence_count: 0.269
normalized_PASS: 0.261
normalized_VPRT: -0.244
normalized_X: 0.174
normalized_NOMZ: -0.154
normalized_PUBV: 0.142
PC16:
normalized_QUOT: 0.594
normalized_CONT: 0.441
normalized_VBD: 0.400
normalized_UH: -0.278
normalized_NUM: 0.167
normalized_PUBV: 0.164
normalized_CONJ: -0.164
normalized_NOMZ: -0.156
normalized_STPR: 0.127
normalized_AUXB: 0.121
PC17:
normalized_PUBV: 0.504
normalized_CONJ: -0.473
normalized_PGAS: 0.303
normalized_VPRT: 0.255
normalized_NOMZ: 0.245
normalized_PASS: -0.219
normalized_QUOT: -0.202
normalized_CONT: -0.160
normalized_AUXB: -0.128
normalized_UH: -0.123
PC18:
normalized_CONJ: 0.612
normalized_PUBV: 0.504
normalized_UH: -0.308
normalized_NUM: -0.210
normalized_NOMZ: 0.166
normalized_PGAS: -0.164
normalized_ART: 0.156
normalized_X: 0.155
normalized_VPRT: 0.152
normalized_DEMP: -0.116
PC19:
normalized_UH: 0.727
normalized_PGAS: -0.434
normalized_PUBV: 0.252
normalized_NOMZ: -0.184
normalized_VPRT: 0.182
normalized_CCONJ: -0.171
normalized_VBD: 0.142
normalized_TIME: -0.092
normalized_ART: 0.091
normalized_INDA: 0.083
PC20:
normalized_ART: 0.432
normalized_DET: -0.341
normalized_DEMO: -0.286
normalized_INDA: 0.280
normalized_DEMP: -0.273
normalized_CONJ: -0.271
normalized_NOMZ: 0.216
normalized_PIT: 0.211
normalized_FPP1: -0.199
normalized_AUXB: 0.199
PC21:
normalized_PGAS: 0.725
normalized_UH: 0.307
normalized_AUXB: 0.253
normalized_NOMZ: -0.222
normalized_CONJ: 0.211
normalized_ART: 0.154
normalized_BEMA: 0.143
normalized_NUM: 0.110
normalized_CCONJ: -0.106
normalized_QUES: -0.106
PC22:
normalized_CCONJ: 0.625
normalized_NOMZ: -0.330
normalized_QUES: -0.312
normalized_VPRT: 0.243
normalized_AUXB: -0.234
normalized_PRIV: 0.176
normalized_BEMA: -0.175
normalized_TIME: 0.174
normalized_SCONJ: -0.162
normalized_WH: -0.132
PC23:
normalized_PRIV: 0.614
normalized_NOMZ: 0.395
normalized_VBD: 0.269
normalized_CCONJ: -0.257
normalized_QUES: -0.246
normalized_PUBV: -0.241
normalized_FPP1: 0.217
normalized_PIT: -0.154
normalized_TO: -0.119
normalized_CONJ: 0.117
Top 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
23531 122.614493 26.508184 ... PHID-USER-arjqb24x4oae7awzpfp6 1424754141
707 122.597764 26.498206 ... PHID-USER-pun3sjvg3cemjzbgyo2t 1363132183
744 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353551242
749 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353384355
2243 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1356175107
5921 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353366778
5933 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353123761
5935 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353386649
10080 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1366298361
10418 122.597764 26.498206 ... PHID-USER-fovtl67ew4l4cc3oeypc 1355363288
[10 rows x 33 columns]
Bottom 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
16080 -172.850818 487.621918 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
24812 -145.955594 438.167891 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
24813 -145.770728 438.255988 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
13983 -97.131561 274.630845 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
16510 -92.274843 294.480228 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
161 -74.924301 196.826216 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
22005 -66.493946 211.966534 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
24815 -66.161789 174.725549 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
6163 -65.995800 195.974595 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
20858 -58.834964 191.798984 ... PHID-USER-22bsa5u75jz3ci3wnplu 1441031208
[10 rows x 33 columns]
Top 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
16080 -172.850818 487.621918 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
24813 -145.770728 438.255988 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
24812 -145.955594 438.167891 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
16510 -92.274843 294.480228 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
13983 -97.131561 274.630845 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
22005 -66.493946 211.966534 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
161 -74.924301 196.826216 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
6163 -65.995800 195.974595 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
20858 -58.834964 191.798984 ... PHID-USER-22bsa5u75jz3ci3wnplu 1441031208
24815 -66.161789 174.725549 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
[10 rows x 33 columns]
Bottom 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
17259 -11.502462 -20.654068 ... PHID-USER-6vzzsmi22zem6yttr6vp 1321220595
24780 -7.649212 -18.480106 ... PHID-USER-lsveyqlsb4acoowxr5yj 1420344576
22246 3.193427 -17.554603 ... PHID-USER-2nnm76h4ykalvvref2ye 1461480989
20534 -7.911332 -16.886866 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1443534834
21315 -10.345515 -16.806925 ... PHID-USER-lzhljhpbm3qfphvqyill 1453975581
5078 -14.656633 -16.709304 ... PHID-USER-wil4b5lylrvf3krixlkl 1377104818
15192 -14.489305 -16.685012 ... PHID-USER-2nnm76h4ykalvvref2ye 1361930342
20939 0.546084 -16.640121 ... PHID-USER-iy46yyjkoqm6q63ztld6 1436977570
7055 -0.820605 -16.571093 ... PHID-USER-cfsvvgbtlqnbt2yokfjf 1377020909
7471 -0.088132 -16.477148 ... PHID-USER-wkpnidxoctuhawexig5p 1386166246
[10 rows x 33 columns]
job finished, cleaning up
job pau at: Mon Oct 20 11:28:52 CDT 2025

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,9 @@ neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , head
# Load the sub-comment PCA table. `comment_text` is added as a copy of the
# `text` column so the table carries the `comment_text` key used when it is
# joined to main_df below.
# NOTE(review): `mutate` is presumably dplyr's, attached outside this view.
neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
# Load the 2025-10-20 PCA table the same way (copy `text` into `comment_text`).
# NOTE(review): file name suggests this is the combined ("total") PCA output
# covering every comment type -- confirm against the script that writes it.
pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
# Load the main analysis table that the PCA tables are joined against.
# NOTE(review): paths are hard-coded relative to the home directory, so the
# script only runs where these files exist at exactly these locations.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv , header = TRUE)
@ -21,6 +24,11 @@ subcomment_joined <- main_df |>
right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
# Attach main_df columns to the PCA table, matching on task, author,
# creation time and comment text. Assuming dplyr's right_join, every row of
# pca_df is kept; rows with no match in main_df get NA in main_df's columns.
total_joined <- main_df |>
right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
# Drop rows whose comment text is the literal string "nan".
# NOTE(review): presumably a missing value serialized as text upstream -- confirm.
filter(comment_text != "nan") #TODO: look at this more in depth
preprocess_comment <- function(message) {
library(stringr)
comment_text <- message
@ -144,10 +152,10 @@ subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
# Rebuild `priority` as a factor whose level order follows `priority_order`
# (defined outside this view). Values of `priority.y` that are not listed in
# `priority_order` become NA.
# NOTE(review): `priority.y` is presumably the join-suffixed priority column
# from the right-hand table -- confirm which source it comes from.
description_joined <- description_joined %>%
mutate(priority = factor(priority.y, levels = priority_order))
ggplot(description_joined, aes(
x = as.factor(priority), # x-axis grouping
ggplot(total_joined, aes(
x = PC1, # x-axis grouping
y = PC2,
fill = AuthorPHID
fill = comment_type
)) +
ylim(-20, 20) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +