
updates to commit modeling for ve

Matthew Gaughan, 2025-03-12 15:15:19 -07:00
commit 04a74ef097 (parent 065616558e)
22 changed files with 594 additions and 101 deletions

BIN  .RData (binary file not shown)

Binary image file (modified): Before 1.1 MiB, After 1.2 MiB (image not shown)

Binary image file (added): 1.2 MiB (image not shown)

Binary image file (added): 1.1 MiB (image not shown)

Binary image file (added): 1.1 MiB (image not shown)

View File

@@ -0,0 +1,9 @@
"","effect","group","level","term","estimate","std.error","conf.low","conf.high"
"1","ran_vals","commit_type","other_commit_count","(Intercept)",0.364855157809777,0.0448820171494965,0.276888020643255,0.4528222949763
"2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.364855157809562,0.0597894025576017,-0.482040233479628,-0.247670082139495
"3","ran_vals","commit_type","other_commit_count","before_after",0.352924544901635,0.0434143938187066,0.267833896606332,0.438015193196938
"4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.352924544901426,0.0578343138752091,-0.466277717167421,-0.239571372635431
"5","ran_vals","commit_type","other_commit_count","relative_week",0.020534807942067,0.00252605338611267,0.0155838342822607,0.0254857816018733
"6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.0205348079420548,0.00336507207192973,-0.0271302280084187,-0.013939387875691
"7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.0610245395646896,0.00750682666355926,-0.0757376494634507,-0.0463114296659284
"8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.0610245395646535,0.0100001896117208,0.0414245280871091,0.0806245510421979
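For orientation: tables in this shape come from tidying the fitted mixed model's conditional modes, which this commit adds to the modeling script below (`condvals <- broom.mixed::tidy(mlm, effects = "ran_vals", conf.int = TRUE)`). A minimal sketch of that round trip, assuming `mlm` is the fitted glmer.nb model from that script:

library(broom.mixed)
# Conditional modes ("ran_vals") for each commit_type level, with confidence intervals.
condvals <- broom.mixed::tidy(mlm, effects = "ran_vals", conf.int = TRUE)
write.csv(condvals, "0312_ve_ve_event_ba_ranefs.csv")
# When read back, a level whose interval excludes zero deviates detectably
# from the corresponding fixed effect.
ranefs <- read.csv("0312_ve_ve_event_ba_ranefs.csv")
subset(ranefs, conf.low > 0 | conf.high < 0)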

View File

@@ -0,0 +1,9 @@
"","effect","group","level","term","estimate","std.error","conf.low","conf.high"
"1","ran_vals","commit_type","other_commit_count","(Intercept)",0.0685032418929246,0.0181064985041167,0.0330151569387276,0.103991326847122
"2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.0685032418929394,0.0191716559555347,-0.10607899708978,-0.0309274866960986
"3","ran_vals","commit_type","other_commit_count","before_after",0.153698714908258,0.0406250196955685,0.0740751394337131,0.233322290382802
"4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.153698714908291,0.0430148822329188,-0.238006334884044,-0.0693910949325384
"5","ran_vals","commit_type","other_commit_count","relative_week",0.00806617338695213,0.00213201816920638,0.00388749456092261,0.0122448522129817
"6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.00806617338695388,0.00225743916316379,-0.0124906728440451,-0.00364167392986262
"7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.00542528453738445,0.00143398916095857,-0.00823585164708405,-0.00261471742768485
"8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.00542528453738562,0.00151834695324874,0.00244937919298198,0.00840118988178927

View File

@@ -0,0 +1,9 @@
"","effect","group","level","term","estimate","std.error","conf.low","conf.high"
"1","ran_vals","commit_type","other_commit_count","(Intercept)",0.19142744594005,0.141824355514081,-0.0865431829981536,0.469398074878253
"2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.191427445940039,0.144215326442934,-0.474084291786877,0.091229399906799
"3","ran_vals","commit_type","other_commit_count","before_after",-0.0755920578722042,0.0560044816831836,-0.185358824944077,0.0341747091996687
"4","ran_vals","commit_type","wikimedia_commit_count","before_after",0.0755920578721999,0.0569486431221849,-0.0360252316157072,0.187209347360107
"5","ran_vals","commit_type","other_commit_count","relative_week",0.0275202739990639,0.0203891615662596,-0.0124417483457733,0.0674822963439011
"6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.0275202739990623,0.0207328958451281,-0.0681560031507335,0.0131154551526089
"7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.030560534526985,0.0226416232645461,-0.0749373006770195,0.0138162316230495
"8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.0305605345269833,0.0230233310665026,-0.014564365167504,0.0756854342214706

Binary file not shown.

View File

@@ -0,0 +1,9 @@
"","effect","group","level","term","estimate","std.error","conf.low","conf.high"
"1","ran_vals","commit_type","other_commit_count","(Intercept)",0,0,0,0
"2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",0,0,0,0
"3","ran_vals","commit_type","other_commit_count","before_after",0.0119022181828498,0.0145794598343909,-0.0166729980066047,0.0404774343723042
"4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.0119022181828497,0.0146331623043827,-0.0405826892793688,0.0167782529136695
"5","ran_vals","commit_type","other_commit_count","relative_week",-0.000674853711220998,0.000826652849473679,-0.00229506352390682,0.000945356101464824
"6","ran_vals","commit_type","wikimedia_commit_count","relative_week",0.000674853711220992,0.000829697770228994,-0.000951324036481026,0.00230103145892301
"7","ran_vals","commit_type","other_commit_count","before_after:relative_week",0.0134416522215151,0.0164651685602625,-0.0188294851559806,0.0457127895990109
"8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",-0.013441652221515,0.0165258169128779,-0.0458316581858585,0.0189483537428285
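The all-zero (Intercept) rows in this table are consistent with a random-intercept variance that the optimizer estimated at zero (a singular fit), in which case every level's conditional mode is exactly zero. A quick check, assuming `mlm` is the fitted model that produced this file:

library(lme4)
isSingular(mlm)  # TRUE when some variance component has collapsed to zero
VarCorr(mlm)     # shows which component (here, the commit_type intercept)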

View File

@@ -2,8 +2,9 @@
 library(dplyr)
 library(lubridate)
 library(tidyr)
+library(purrr)
-ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/extensions_visualeditor_commits.csv"
+ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/mediawiki_core_commits.csv"
 transform_commit_data <- function(filepath){
   #basic, loading in the file
@@ -47,7 +48,14 @@ transform_commit_data <- function(filepath){
   }
   df <- df |>
-    mutate(relative_week = relative_week(commit_date, event_date))
+    mutate(relative_week = relative_week(commit_date, event_date)) |>
+    arrange(relative_week) |>
+    group_by(author_email) |>
+    mutate(first_commit_date = first(commit_date),
+           new_author = as.numeric(difftime(commit_date, first_commit_date, units = "days") <= 7),
+           new_author_wmf = if_else(grepl("@wikimedia", author_email), new_author, 0),
+           new_author_unaff = if_else(!grepl("@wikimedia", author_email), new_author, 0)) |>
+    ungroup()
   #filler for when there are weeks without commits
   all_weeks <- seq(relative_week(start_date, event_date), relative_week(end_date, event_date))
@@ -57,19 +65,18 @@ transform_commit_data <- function(filepath){
   #for each week, get the list of unique authors that committed
-  cumulative_authors <- df %>%
-    arrange(relative_week) %>%
-    group_by(relative_week) %>%
-    summarize(cumulative_author_emails = list(unique(author_email)), .groups = 'drop')
+  #cumulative_authors <- df %>%
+  #  arrange(relative_week) %>%
+  #  group_by(relative_week) %>%
+  #  summarize(cumulative_author_emails = list(unique(author_email)), .groups = 'drop')
   #same for each committer
-  cumulative_committers <- df %>%
-    arrange(relative_week) %>%
-    group_by(relative_week) %>%
-    summarize(cumulative_committer_emails = list(unique(committer_email)), .groups = 'drop')
+  #cumulative_committers <- df %>%
+  #  arrange(relative_week) %>%
+  #  group_by(relative_week) %>%
+  #  summarize(cumulative_committer_emails = list(unique(committer_email)), .groups = 'drop')
   #now cut out the commit data that we don't care about
   df <- df |>
-    filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |>
     filter(author_email != "jenkins-bot@gerrit.wikimedia.org")
@@ -84,9 +91,11 @@ transform_commit_data <- function(filepath){
       author_emails = list(unique(author_email)),
       committer_emails = list(unique(committer_email)),
       mediawiki_dev_commit_count = sum(grepl("@users.mediawiki.org", author_email)),
-      wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)),
+      wikimedia_commit_count = sum(grepl("@wikimedia", author_email)),
       wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)),
       bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)),
+      wmf_ft_commit_count = sum(new_author_wmf),
+      unaff_ft_commit_count = sum(new_author_unaff),
       .groups = 'drop') |>
     right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |>
     replace_na(list(commit_count = 0)) |>
@@ -95,44 +104,47 @@ transform_commit_data <- function(filepath){
     replace_na(list(jenkins_commit_count = 0)) |>
     replace_na(list(mediawiki_dev_commit_count = 0)) |>
     replace_na(list(wikia_commit_count = 0)) |>
-    mutate(before_after = if_else(relative_week < 0, 0, 1))
+    replace_na(list(wmf_ft_commit_count = 0)) |>
+    replace_na(list(unaff_ft_commit_count = 0)) |>
+    mutate(before_after = if_else(relative_week < 0, 0, 1)) |>
+    select(-author_emails, -committer_emails)
   # then, to get the authorship details in
   # we check if the email data is present, if not we fill in blank
   # we bring in the information about authorship lists that we already had
   # then comparing the current week's author list with the previous week's cumulative list, or empty
   # ---- the length of that difference is the 'new' value
   # then we delete out the author list information
-  weekly_with_authorship <- weekly_commits |>
-    mutate(
-      author_emails = ifelse(is.na(author_emails), list(character()), author_emails),
-      committer_emails = ifelse(is.na(committer_emails), list(character()), committer_emails)
-    ) |>
-    left_join(cumulative_authors, by = "relative_week") |>
-    left_join(cumulative_committers, by = "relative_week") |>
-    mutate(new_author_emails = mapply(function(x, y) length(setdiff(x, y)), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
-           new_committer_emails = mapply(function(x, y) length(setdiff(x, y)), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))))
-  weekly_with_authorship <- weekly_with_authorship |>
-    mutate(
-      wikimedia_author_emails = mapply(function(x) length(grep("@wikimedia.org", x)), author_emails),
-      non_wikimedia_author_emails = mapply(function(x) length(x) - length(grep("@wikimedia.org", x)), author_emails),
-      wikimedia_committer_emails = mapply(function(x) length(grep("@wikimedia.org", x)), committer_emails),
-      non_wikimedia_committer_emails = mapply(function(x) length(x) - length(grep("@wikimedia.org", x)), committer_emails),
-      new_wikimedia_authors = mapply(function(x, y) length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
-      new_non_wikimedia_authors = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
-      new_wikimedia_committers = mapply(function(x, y) length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))),
-      new_non_wikimedia_committers = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1))))
-    ) |>
-    select(-author_emails, -committer_emails, -cumulative_author_emails, -cumulative_committer_emails)
+  #weekly_with_authorship <- weekly_commits |>
+  #  mutate(
+  #    author_emails = ifelse(is.na(author_emails), list(character()), author_emails),
+  #    committer_emails = ifelse(is.na(committer_emails), list(character()), committer_emails)
+  #  ) |>
+  #  left_join(cumulative_authors, by = "relative_week") |>
+  #  left_join(cumulative_committers, by = "relative_week") |>
+  #  mutate(new_author_emails = mapply(function(x, y) length(setdiff(x, y)), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
+  #         new_committer_emails = mapply(function(x, y) length(setdiff(x, y)), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))))
+  #weekly_with_authorship <- weekly_with_authorship |>
+  #  mutate(
+  #    wikimedia_author_emails = mapply(function(x) length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), author_emails),
+  #    non_wikimedia_author_emails = mapply(function(x) length(x) - length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), author_emails),
+  #    wikimedia_committer_emails = mapply(function(x) length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), committer_emails),
+  #    non_wikimedia_committer_emails = mapply(function(x) length(x) - length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), committer_emails),
+  #    new_wikimedia_authors = mapply(function(x, y) length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
+  #    new_non_wikimedia_authors = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))),
+  #    new_wikimedia_committers = mapply(function(x, y) length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))),
+  #    new_non_wikimedia_committers = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1))))
+  #  ) |>
+  weekly_commits <- weekly_commits |>
+    filter(relative_week >= (-52) & relative_week <= 52 )
   #gracefully exit
-  return(weekly_with_authorship)
+  return(weekly_commits)
 }
-test <- read.csv(ve_commit_fp, header = TRUE)
 transformed <- transform_commit_data(ve_commit_fp)
-output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
+output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_mediawiki_core_weekly_commit_count_data.csv"
 write.csv(transformed, output_filepath, row.names = FALSE)
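The substantive change in this transform is the per-author "first-time" window: a commit is flagged as coming from a new author when it lands within seven days of that author's first observed commit, split by whether the address matches "@wikimedia". A self-contained sketch of just that logic on toy data (the data frame here is illustrative, not the project's commit file):

library(dplyr)
toy <- tibble::tibble(
  author_email = c("a@wikimedia.org", "a@wikimedia.org", "b@example.com"),
  commit_date  = as.Date(c("2013-01-01", "2013-03-01", "2013-01-05"))
)
toy |>
  arrange(commit_date) |>
  group_by(author_email) |>
  mutate(first_commit_date = first(commit_date),
         # within 7 days of the author's first observed commit
         new_author = as.numeric(difftime(commit_date, first_commit_date, units = "days") <= 7),
         new_author_wmf   = if_else(grepl("@wikimedia", author_email), new_author, 0),
         new_author_unaff = if_else(!grepl("@wikimedia", author_email), new_author, 0)) |>
  ungroup()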

View File

@@ -1,17 +1,18 @@
 library(tidyverse)
+library(dplyr)
-entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_mediawiki_core_weekly_commit_count_data.csv"
+entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0312_extensions_ve_weekly_commit_count_data.csv"
 entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
-widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_mediawiki_core_weekly_commit_count_data.csv"
+widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0312_extensions_ve_weekly_commit_count_data.csv"
 widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
-event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_mediawiki_core_weekly_commit_count_data.csv"
+event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_extensions_ve_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 #input_df <- bind_rows(entest_df, widetest_df, event_df)
 #dropping the event (2013-07-01) from the modeling
-input_df <- bind_rows(entest_df, widetest_df)
-#input_df <- event_df
+#input_df <- bind_rows(entest_df, widetest_df)
+input_df <- event_df
 input_df <- input_df |>
   mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -27,7 +28,7 @@ library(dplyr)
 #get into mlm format
 long_df <- input_df |>
-  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+  tidyr::pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, unaff_ft_commit_count),
                names_to = "commit_type",
                values_to = "lengthened_commit_count")
@@ -38,24 +39,28 @@ intermediate_long_df <- long_df |>
 library(rdd)
-var(intermediate_long_df$lengthened_commit_count) # 1253.343
-mean(intermediate_long_df$lengthened_commit_count) # 44.92381
-median(intermediate_long_df$lengthened_commit_count) # 39.5
+var(intermediate_long_df$lengthened_commit_count)
+mean(intermediate_long_df$lengthened_commit_count)
+median(intermediate_long_df$lengthened_commit_count)
 get_optimal_bandwidth <- function(df){
   bw <- tryCatch({
-    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+    IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
   }, error = function(e) {
     NA
   })
 }
+intermediate_long_df <- intermediate_long_df |>
+  filter(commit_type != "unaff_ft_commit_count")
 optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
-library(dplyr)
-window_num <- 4
+window_num <- 10
 final_long_df <- intermediate_long_df |>
-  filter(relative_week >= (- window_num) & relative_week <= (window_num))
+  filter(relative_week >= (-window_num) & relative_week <= window_num) |>
+  filter(commit_type != "unaff_ft_commit_count")
 library(fitdistrplus)
 descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
@@ -70,14 +75,24 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
                            optCtrl=list(maxfun=2e5)), nAGQ=0,
                 data=final_long_df)
 #(before_after*relative_week|rd_event)
-#mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
-#  (before_after*relative_week|commit_type) +
-#  (before_after*relative_week|rd_event) ,data=long_df)
+saveRDS(mlm, file = "0312_ve_ve_event_commits_mlm.rds")
+#mlm <- readRDS("commit_analysis/case1/0312_core_ve_testing_commits_mlm.rds")
 summary(mlm)
 qqnorm(residuals(mlm))
 res <- ranef(mlm)
 print(res)
+library(broom.mixed)
+library(ggplot2)
+condvals <- broom.mixed::tidy(mlm, effects = "ran_vals", conf.int = TRUE)
+glmer_ranef_ba <- condvals
+write.csv(glmer_ranef_ba, "0312_ve_ve_event_ba_ranefs.csv")
 texreg(mlm)
 #final_long_df <- final_long_df |>
 #  drop_na()
@@ -116,34 +131,3 @@ summary(other_share_lmer)
 qqnorm(residuals(other_share_lm))
 texreg(other_share_lm)
-#power analysis
-#library(simr)
-#simrOptions(progress=FALSE)
-## Intercept and slopes for intervention, time1, time2, intervention:time1, intervention:time2
-#wmf_fixed <- c(0.511, -0.166, 0.002, 0.007)
-## Random intercepts for participants clustered by class
-#wmf_rand <- matrix(c(
-#  0.01, 0.005, 0.002, 0.001,
-#  0.005, 0.02, 0.003, 0.004,
-#  0.002, 0.003, 0.015, 0.006,
-#  0.001, 0.004, 0.006, 0.01
-#), nrow=4, byrow=TRUE)
-## residual variance
-#wmf_res <- 0.2065
-#wmf_model <- makeLmer(commit_share ~ before_after*relative_week + (before_after*relative_week | rd_event),
-#                      fixef=wmf_fixed, VarCorr=wmf_rand, sigma=wmf_res, data=wikimedia_long_df)
-#sim_treat <- powerSim(wmf_model, nsim=100, test = fcompare(commit_share~relative_week))
-#sim_treat
-#model_ext_subj <- extend(wmf_model, within="rd_event+before_after+relative_week", n=30)
-#sim_treat_subj <- powerSim(model_ext_subj, nsim=100, test = fcompare(commit_share~before_after*relative_week))
-#sim_treat_subj
-#p_curve_treat <- powerCurve(model_ext_subj, test=fcompare(commit_share~before_after*relative_week),
-#                            within="rd_event+before_after+relative_week",
-#                            breaks=c(5,10,15,20))
-#plot(p_curve_treat)
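The bandwidth helper above wraps rdd::IKbandwidth in tryCatch because the Imbens-Kalyanaraman estimator can fail outright on short or sparse running variables, which is presumably why the script tolerates an NA. A minimal reproduction on simulated data (all values illustrative):

library(rdd)
set.seed(1)
relative_week <- rep(-10:10, times = 5)
commit_count  <- rpois(length(relative_week), lambda = 20 + 5 * (relative_week >= 0))
# Same guarded call as get_optimal_bandwidth(): yields NA instead of an error
# when no bandwidth can be computed.
bw <- tryCatch(
  IKbandwidth(relative_week, commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular"),
  error = function(e) NA
)
bw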

View File

@@ -1,11 +1,11 @@
 library(tidyverse)
-entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0312_mediawiki_core_weekly_commit_count_data.csv"
 entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
-widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0312_mediawiki_core_weekly_commit_count_data.csv"
 widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
-event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
+event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_mediawiki_core_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 #input_df <- bind_rows(entest_df, widetest_df, event_df)
@@ -21,24 +21,29 @@ input_df <- input_df |>
   dplyr::select(-wikia_commit_count)
 #get into mlm format
+#long_df <- input_df |>
+#  tidyr::pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+#                      names_to = "commit_type",
+#                      values_to = "lengthened_commit_count")
 long_df <- input_df |>
-  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+  tidyr::pivot_longer(cols = c(wmf_ft_commit_count, unaff_ft_commit_count, nonbot_commit_count),
                names_to = "commit_type",
                values_to = "lengthened_commit_count")
-intermediate_long_df <- long_df |>
-  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
-  mutate(log_commits = log1p(lengthened_commit_count))|>
-  mutate(scaled_long_commits = lengthened_commit_count / 10)
+intermediate_long_df <- long_df
+#|>
+#  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
+#  mutate(log_commits = log1p(lengthened_commit_count))|>
+#  mutate(scaled_long_commits = lengthened_commit_count / 10)
-window_num <- 4
+window_num <- 8
 final_long_df <- intermediate_long_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num))
 affiliationColors <-
-  setNames( c('#5da2d8', '#c7756a')
-            ,c("other_commit_count", "wikimedia_commit_count"))
+  setNames( c('#5da2d8', '#c7756a', 'black')
+            ,c("unaff_ft_commit_count", "wmf_ft_commit_count", "nonbot_commit_count"))
 commit_plot <- final_long_df |>
@@ -52,11 +57,14 @@ commit_plot <- final_long_df |>
   scale_color_manual(values = affiliationColors,
                      labels = c("other_commit_count" = "Unaffiliated", "wikimedia_commit_count" = "WMF Affiliated")) +
   scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
-  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
+  ggtitle("VisualEditor New Contributors Commits Around Opt-In Testing Events (by Affiliation)") +
   theme_bw() +
   theme(legend.position = "top")
 commit_plot
+ggsave(filename = "0312-core-ve-testing-new-commits.png", plot = commit_plot, width = 15, height = 9, dpi = 800)
 total_commit_plot <- final_long_df |>
   ggplot(aes(x=relative_week,
              y=nonbot_commit_count,
@@ -74,8 +82,6 @@ total_commit_plot
 ggsave(filename = "0305-ve-total-commits.png", plot = total_commit_plot, width = 15, height = 9, dpi = 800)
-final_long_df <- final_long_df|>
-  drop_na()
 commit_share_plot <- final_long_df |>
   ggplot(aes(x=relative_week,
@@ -93,4 +99,4 @@ commit_share_plot <- final_long_df |>
   theme(legend.position = "top")
 commit_share_plot
-ggsave(filename = "0305-ve-testing-share.png", plot = commit_share_plot, width = 12, height = 9, dpi = 800)
+ggsave(filename = "0312-ve-testing-share.png", plot = commit_share_plot, width = 12, height = 9, dpi = 800)

View File

@@ -0,0 +1,17 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:57439 mjilg@klone.hyak.uw.edu
(-N holds the connection open for forwarding only; -L forwards local port 8787 to port 57439 on compute node n3439 via the klone login node)
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: Q5pJ0QUFUjx7RBFTzpvm
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 24802792

View File

@@ -0,0 +1,219 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
" warnings.warn(warn_msg)\n"
]
},
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
" with torch.cuda.amp.autocast(self._mixed_precision):\n"
]
}
],
"source": [
"phab_df['text'] = phab_df['comment_text'].apply(str)\n",
"phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -837,7 +837,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.21"
+   "version": "3.10.16"
   }
  },
  "nbformat": 4,

View File

@@ -0,0 +1,219 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
" warnings.warn(warn_msg)\n"
]
},
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
" with torch.cuda.amp.autocast(self._mixed_precision):\n"
]
}
],
"source": [
"phab_df['text'] = phab_df['comment_text'].apply(str)\n",
"phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1811,7 +1811,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.21"
+   "version": "3.10.16"
   }
  },
  "nbformat": 4,