From d513e245b59b73a0c709b7dda5e8fcd7e9913d96 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 2 Dec 2025 14:20:38 -0800 Subject: [PATCH] updating some plots for results section, also saving model to file --- dsl/120225_logit_dsl.RDS | Bin 0 -> 7734 bytes dsl/dsl.R | 41 +++++++++++++- dsl/final_bivariate.R | 3 +- p2/quest/neurobiber_PCA_analysis.R | 86 +++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 dsl/120225_logit_dsl.RDS diff --git a/dsl/120225_logit_dsl.RDS b/dsl/120225_logit_dsl.RDS new file mode 100644 index 0000000000000000000000000000000000000000..ffdcccf014cbb4288bbf7af6d4d471116b04bcf3 GIT binary patch literal 7734 zcmdscXHZjX*Y@$Kpdv&?L25)$q$?c)iGq{}h@f-`Aieia$gvvq zgx(=Q=%GV^BqV%M&-2ZEGv7P!|7ZW%Gi&aBt##jf?R8yiZD7Q?zkXk*H^$DJMowcP z+r?&3hYIYY@YipLv<6VZMiB|{?g9auyh4XWGy5ujqpoH?K9egxbT(S&I+OG(lTthM zPcpqkz4y>a>=r5@qbv$yfHWIU zb|h&kbC1Ae9*cI4)-(;I9>>X{hBlVBoCtd|$D=SC(I6EO5i$Mfiol+$MV~#Z7B7T- zueMFo>;uijB*(z8j%-6VCPCHk!_E9Uh_4#7{&3L%gAHXVI}jTb;(nvQPSAMj9Z}Qi zetp32RqUw=mbAIN($z50ApKj9KWi(aPBC*V*FWkfe2S}5$l%N1LAUugB|9@6dP(&Ux284G{4`uT}_E_;hHAlxugUrhYz_zS)w zBb~JBu6{xMU$s3MXZ^eJ2X+%Cep=|BGU+mq(_5x3(O^Z37oyF3=Y0m*IA3H~k4ph0 zFDd~T2e$ZaM3l4-T(=}e@3I?2O<8@snx?U5$TiJu78yhQObT_*@2>|7dj-#a7K(Ti#W^=^M zFGb$sw5Sz?v3fd)9nzJ`;z`|w9q+FmbtfOxC=I|e$|Fs_$L{#t4tQScW%O-D<+6-X zO|xIzgd)=e4-PsenT6zeA@YQr^-R>aG`6jm(PH|Uub-h7{c$>JSstLU5c!m_!P9vK z9Fy&z;;bULVBZ6tJ1kUHG)?&Y&=#30)zDY5D7-7%6f~fmB)2dxzfXNEkF*Xx&*yf& zDA4~-wP!L%xDb=MbW~{zXcP!9NI;(#+g#ta{&7Taui{S~{9?_J=7l+&K`N*N+Y?JO ztgo{~RnF_WITIXSy18~wInM{3TfSJmBepFY1_V#MFY3oJ3cAY6y6sl?PmzW(`3zjZ z&Y*~oe@Ba&m9xJK{E2N0EQK>TuX~VO5;jb&uaE@ywW1Rl%X>X`mg*kIeYogew5HCk@NpyHgix%a&A1X%LCT-u&+<+4656e?BIb6ZojUm~1O zq1*AQ=T6zZuKFl_#^x#ei34qa{QT*qmoGVWk$dV*{`~~ry#QpS8&fAJv|3)UHgW09 zTx~1hV{PT7T+sAD8PniBrRm4l^`e_Sn6>2-kQR4yFQ6WeO(0rp93`6y90JF@1Ngc>?(WIRZ7WWm?!Gcb5R_6Y`-y*fQ2OdLX#(i<;azpI|_%)0@Z0OWwT z`U83JVI9G@<968M8WSnfFzy7j(wDo_TaOluPDNLZhhGJJDH%&_hUAKJ!}lLyJ$^ip 
zcNnbSSB7&&AJ!IC%H138*K2Au8`$cxKj{y-K|n!30era%!}z?e(3_o(ozO z`rCb^h5~#VS~f&qZ$3zgU$q_FnRuxJv~?&%cVfRG%OGWm^D*Bo@VuA+A2}Plb=z~{$g_v!IQ5p5E2k74kZ$@)tWDk|v zX@do4j9K={V3$k#OJ~ZzZC%BGVn_nKtnb0U9;o}Wj!yBp+IzAki``DE!TVuLu7OVk zZ!T`5Dr#)<6A-!zf|WoCCB&)n7EqC272(46rHDrG$x^K1$-~s-NsXm=;XtLqm}QX0 z0lD3PZ~Mgb=R)OfUHR%R!8*R){K$>{C>tAz#qQv+(FA2m3kZG)+Aa5^EXpjNY_aSi zr*~grQxQ$8`DoC|!ALp!!ATy}fUQCt~8C$l+%hbEs}$WqiL1Kgq8Smm6t!01W1 zb+i0~7`I|Q=nc=qZC?Is)SVNlZ3@}(-5I6+m>K=$?oeQ}6UB+Wag`NnNDjRi5G-`* zvRWa}2o}@sXYdeqe1B%0&hFY&o9j;li^VZj$tZ;_6=VpDk`q^P8MfHSw#D>a+{{Mf zwCcMnBK*T)iC&l=KqIA#_u72P=xdGCG;U{HO>YCq=eEaz>3F=^e?=El(mV_ms79IBUg z;tV+Kxr?6+fcBc4E_NSr!3~GEnsabh$*DXdVV^yl%meL^TQ4HwbaA>;%uiGn1QP~E zn43i3PO~0+{m@coeBi}X&Hpld38*3Jga)@ep(s#?O%#mNgf znQia1w~4+IWeNS65dR(Ni8XuM-YD@14Cw71~8TZY1>97;}^?t1v6ip5NrCMdm zvL8~9qPK12pjQ^pnSgXGf?UC!NA}#HN;Q4*yEHNN37ZYy6kfSd$F76KbUYQBC30-_ zqvWVCZYlh{KGUmY^`Q%IZtF81f2Hb3S#~t6DjRWm^NsmhL*DhKS>8CM)t{*AKt;P!Q*lM1#%TuEL(O#2Kya($wt3`T(OW z);yRINXd3y>F~>3QgWNDh{~Xp@)R4IIY}g40()sgbgkBNX)^YU@5fb=T=-Ho7(Jpv zA@vh;aM=L`E3Q0~h==ll)U0wsg^ZdGEP$TF0c3EzyG^lT;u6#u>;G82yA;$ntuXzq zthBhVDw63u9i=|&r{d$kAqouFjF2LtzMeUh`}p_~1~Z5}(mU962E@Odww0lh{f&{% zBW&r`Y#sbf@JF`92tM}yR0%9l^_{ezY8m^NtMKbhd<_TBxkMc$?tAnoGVGVNF&=Jn zDwdt7L);0_wNMippf#<$o1raE+Sh7zzW77)KRA2biS~0H-GUa0*Bd>z$X^z|58!us z%K(?WW~13$;f-uO+vPY60ZEs|9ncu(SM@~&X5!VB;-h1hGV4a)SZe}x-_)Y=lD}Vk z+eS3tu6NmBjVwNVtfe?($H#6CZz)uMdiHqN($aeCsC|m6L`pvZZqOSet6^VgHM}z_ zW+8JF#FCELhDFtJPdXngUlV#}2K&4=a}AMTAnEudt)i^t*lCO70tf3LC2~`&8jGUx z{yL%J--LytWzA}7TWD4OXW1q@t5a%5A5y|SzxfesNGRddg) zRfYr3Xm^!#8;LMD>yU*z@{nOA7duDTWjVE^U-OSnK zNzj^6<}#c=2*yl@Gw#mJqt_JiXsepZGl{#XqY`m0sym?IeitgB{ow3uD z&yuR2gHKi-O9B+x=0~LEKVL!IJ$LC%x}+LwIZ=D={K?nDM(Lw{VDp9aaIvDakeW;A zf=ArPeF(-fc5XZTX5PyPfOPYRiEo_AIogQonGkV7Z(a8;*wR1P#%kvpp|AAIGNKD| zz>t{S4b7#DD$duBSY`x=fa5P{A4i2&KG#U!+NjY@Kg%M|*>v|~!4iBW7hUVNQ}eFw zoF?yaJ(3jqwJZ7v_i5lV^H2Tx`IexKt=By*$x|$#GaL9WK|^AJ-gJBxNOdPW;>fYU zB66d&MEj{BG0Jv?C!)7f5WF?4CIwPn{`D3a 
z^WX-DMz5iyO`6%cLhwD*8@&tQgx#5YVR|kBZLq*%^#RRw-6Vp&pLLzxDQBm!A;dUZ!;E9y!wVL5( zml9BjZpP|J^;0?nF>gu-jRiacZc%NxaGq55>fFPn4hfcZF0FwPJBrdpCYou z)!LWchKeq@Az6y{m@)R2_n@?EG5BTs?WI9>ON4&}kJc%)+|IYr)sW#n@b!|8OR+y#s*J0yi;tM(dnQAJ^GBx~Y(dDAv}c1^kP)JOZ|aOK!Cv>v>puLq9(V2JHJT zCaGLYHnDoQXsgnFEB0s>9uJMLD#rnEq(*xM;tfGZ+a;NDZMKm7bflc$Yr0WHHBg;a zV&+C-dimLaJ*}f$0NI`|T#?++;RA502lEDU&RL;M`EqlB%;5Ka$Lk>-LEea`PTC*m z+z(rJRGoGbpXzoKne)9Uk4Z6jdR?UHZ-oqPZ(p2dZ~e$V<9ILWOS)?AZDk$m>r<>4 z*s$twdoFMIl$HD2)}o#_;aB!=dOAZ7J=p%g6?$$|NVvCc#y+gpp@A;33>8`|G&Fpy zq2xY8NzL_%n$%b@-^C1FRqOko3Utq2DOX*L+Um4!(?J)04v-Q>G42t2a0}MOjscf^ zBJ)pUR;Ws38a0g42nI?E9=8-98M6c(dWJ-HR~IVD;B^%k=ou632_0OQ$oVVy5HmEw zlgu`5{w0K;qW0@`7>ilcUWX@*hr5`T#C}ZwR6QFs1z6{<2qS&n81+8kBePAL+oK1m z*?;);S33VvQ_x}%i;%RK#9#C+58N+tp7}4o9cz2Nx4gQ@rdkti`GGbqdf;mxr)=Om zBp%sF*J@v?Y`2|0U(x>Y$-t!s?N+@jJDJjyeyfOHSq*+`?TJEl&MKUsn}+@knC~$L z3_6~iJ3APkEaz(dMBQqcUqW8<>l^Sei-24A6VvfTnKm26oxH&a(xmN$SWN5Js(ep6 z-1f+MFZpplmFLz?rHvL+Flk@Y!U-Zu5&*YvY;!PlWv9A}VcR#UH;<_Y96s>rOTmgi z$I5aiZss5yR(NgYUd3qM74>yfVA$%&_z?q%dpnAWsgbzbt14iYDx{EdtHllp1jER_ za?uIJrNLZ-4EzGz=>x-g=Wg{dKb9ZnXq3sv(NxWIbq4rUEYYkRTwX*?0b~XEn5>+pYk8hURW4xQ>@sXqZ3D7o;q6FJ0AB1|4G)AvGQ+z=F78Tnd9K_ib`9En=&phlV#ETxs7 zUxT@2fAQv3zOD0J+uqyD2Au75N&Jt?7{$K2Qcq)=)+iSn9y&h?7+gN>d-80kV;v^T z4XJx3mInRGtmZ74K$m`-;@U|theOWvlS;~2aqN{L44uKfe7J~P0k>AZC&cM&Xpi5t zy489kBj73LWRc`!4H|$YZs5RwkXD~MpE=Y6tWnapGD0^EFbA{~szWpSx?VYclmNO4x&uqwlvVNAI{rW&G zYKNP%@71~;w>N@^!i^#PmhA+BXKV?QB?Y5x9V`QdiFqINoR87E=jlH=QnPGOB$!Q` z@vfhF5g_ZhTc}zlx@mo~a3E2S?xRJr3?;qDBy*0>! 
zOO-n5{jptp>E3>uJY=8i;}8>?Y+o1ahN9hYr1EBZr zFZQl*lIFku;KFqk0=&_;$#x3k7}q$8johlA+W?{o3bB4qQqWklx%8)4T@qhRCzv%JI3rmOv9)}S}UCY67ALpQ|{MJFz`iTo_tsQ7+1a{FtR_U|<>_|#u z2nlc<98qCSl#JrYu;a?kV=>kBChMUJ#&`qpXO<(8Fmy*PByg141<43nFV)A*rE8oV zG;0v++`z_^Ib>H`Za;BQ@WbPiVY1G^oQanbkWz=&{Ne>prBAav|w`gXm3bqNs~!NcT>a99Jg-eWApUx#bU#lR2!>Qra#@= zkr0}%Kz>3q<@~JM|7X&PREL_K zE$^hK2@bmd$c_aX1U8<5DFngw$Mo5f25dre_In*opBx*}3{HD3En>0~0mF)P0+b|JTImMFZabiyr!G9W6O#F*Hx+X#l2>fPen(q%AlXzs-xTE=VH zgh^+26m(IGzP$TRn_RBS!mSw8= @@ -81,7 +81,7 @@ dev_model <- dsl( data=dsl_df ) summary(dev_model) - +saveRDS(dev_model, "120225_logit_dsl.RDS") library(broom) library(dplyr) @@ -101,6 +101,43 @@ tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALS return(out) } coef_df <- tidy.dsl(dev_model) +coef_df <- coef_df |> + mutate( + term = recode(term, + "week_index" = "Weeks from deployment", + "(Intercept)" = "Intercept", + "n_comments_before" = "# of comments prior to resolution", + "median_PC4_adac" = "Median Author PC4 Pre-resolution", + "median_PC3_adac" = "Median Author PC3 Pre-resolution", + "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)", + "median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)", + "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'", + "human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'", + "human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'", + "as.factor(source)c3" = "HTTP-deprecation (factor)", + "as.factor(source)c2" = "HTTPS-as-default (factor)", + "as.factor(isAuthorWMF)TRUE" = "WMF-affiliate Author (factor)", + "as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliate Author:HTTPS-as-default", + "as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliate Author:HTTP-deprecation", + ), + term = factor(term, levels = rev(c( + "Intercept", + "% of sentences discussing 'Existent Problems'", + "% of sentences discussing 'Solutions'", + "% of sentences discussing 'Record Keeping'", + 
"Median Author PC4 Pre-resolution", + "Median Author PC3 Pre-resolution", + "# of comments prior to resolution", + "Median # of Code Reviewers (Gerrit)", + "Median LoC Changed (Gerrit)", + "Weeks from deployment", + "HTTPS-as-default (factor)", + "HTTP-deprecation (factor)", + "WMF-affiliate Author (factor)", + "WMF-affiliate Author:HTTPS-as-default", + "WMF-affiliate Author:HTTP-deprecation" + ))) + ) ggplot(coef_df, aes(x = estimate, y = term)) + geom_point(size = 1) + geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) + diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R index 3df73e4..66f07a1 100644 --- a/dsl/final_bivariate.R +++ b/dsl/final_bivariate.R @@ -93,7 +93,8 @@ ggplot( geom_point() + geom_smooth() + scale_color_viridis_d() + - theme_minimal() + theme_minimal() + + labs(x = "Weeks from Release", y = "% of sentences machine-tagged as 'Existent Problems'", title = "Proportion of 'Existent Problems' tags over time") dsl_df <- dsl_df |> mutate(priority = factor(priority, diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index 1689783..98824a3 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -12,6 +12,92 @@ library(dplyr) main_csv <- "~/analysis_data/110925_unified.csv" main_df <- read.csv(main_csv , header = TRUE) +main_df |> + ggplot( + aes( + x = PC4, + y = PC3, + fill = comment_type + ) + ) + + facet_grid(~source, scales="fixed", + labeller = as_labeller(c( + "c1" = "VisualEditor (c1)", + "c2" = "HTTPS-as-default (c2)", + "c3" = "HTTP-deprecation (c3)" + ))) + + geom_point(shape = 21, alpha=0.3, size=2) + + xlim(-50, 50) + + ylim(-50, 50) + + scale_fill_viridis_d( + option = "magma", + name = "Comment type", + labels = c("Task Description", "Reply"))+ + theme_minimal() + + theme(legend.position = "top") + + labs( + title = "PCs for Task Comments by comment type and case",
+ x = "Technical-matter v. Procedural Commentary (PC4)", + y = "Casual v. Formal Updates (PC3)", + ) + +main_df |> + filter(ADAC=="1") |> + ggplot( + aes( + x = PC4, + y = PC3, + fill = isAuthorWMF + ) + ) + + facet_grid(comment_type~source, + labeller = as_labeller(c( + "c1" = "VisualEditor (c1)", + "c2" = "HTTPS-as-default (c2)", + "c3" = "HTTP-deprecation (c3)", + "task_description" = "Task Description", + "task_subcomment" = "Follow-up Reply" + ))) + + geom_point(shape = 21, alpha=0.3, size=2) + + scale_fill_viridis_d( + name = "Comment Author Affiliation", + labels = c("Nonaffiliated", "WMF-affiliated"))+ + theme_minimal() + + theme(legend.position = "top") + + labs( + title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)", + x = "Technical-matter v. Procedural Commentary (PC4)", + y = "Casual v. Formal Updates (PC3)", + ) + +main_df |> + filter(comment_type=="task_subcomment") |> + ggplot( + aes( + x = PC4, + y = PC3, + fill = as.factor(ADAC) + ) + ) + + facet_grid(isAuthorWMF~source, + labeller = as_labeller(c( + "c1" = "VisualEditor (c1)", + "c2" = "HTTPS-as-default (c2)", + "c3" = "HTTP-deprecation (c3)", "TRUE" = "WMF-affiliated", "FALSE" = "Nonaffiliated" + ))) + + geom_point(shape = 21, alpha=0.13, size=2) + + scale_fill_viridis_d( + option = "turbo", + name = "By Task Author Before Resolution", + labels = c("No", "Yes"))+ + theme_minimal() + + theme(legend.position = "top") + + labs( + title = "PCs for Replies (by Author Affiliation, Case, and Comment Type)", + x = "Technical-matter v. Procedural Commentary (PC4)", + y = "Casual v. Formal Updates (PC3)", + ) + main_df <- main_df |> mutate( comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))