updates to commit modeling for ve
This commit is contained in:
		
							parent
							
								
									065616558e
								
							
						
					
					
						commit
						04a74ef097
					
				
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 1.1 MiB | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-core-testing-share.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-core-testing-share.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 1.2 MiB | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-core-ve-testing-new-commits.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-core-ve-testing-new-commits.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 1.2 MiB | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-ve-testing-new-commits.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-ve-testing-new-commits.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 1.1 MiB | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-ve-testing-share.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312-ve-testing-share.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 1.1 MiB | 
							
								
								
									
										9
									
								
								commit_analysis/case1/0312_core_ve_event_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								commit_analysis/case1/0312_core_ve_event_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | |||||||
|  | "","effect","group","level","term","estimate","std.error","conf.low","conf.high" | ||||||
|  | "1","ran_vals","commit_type","other_commit_count","(Intercept)",0.364855157809777,0.0448820171494965,0.276888020643255,0.4528222949763 | ||||||
|  | "2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.364855157809562,0.0597894025576017,-0.482040233479628,-0.247670082139495 | ||||||
|  | "3","ran_vals","commit_type","other_commit_count","before_after",0.352924544901635,0.0434143938187066,0.267833896606332,0.438015193196938 | ||||||
|  | "4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.352924544901426,0.0578343138752091,-0.466277717167421,-0.239571372635431 | ||||||
|  | "5","ran_vals","commit_type","other_commit_count","relative_week",0.020534807942067,0.00252605338611267,0.0155838342822607,0.0254857816018733 | ||||||
|  | "6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.0205348079420548,0.00336507207192973,-0.0271302280084187,-0.013939387875691 | ||||||
|  | "7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.0610245395646896,0.00750682666355926,-0.0757376494634507,-0.0463114296659284 | ||||||
|  | "8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.0610245395646535,0.0100001896117208,0.0414245280871091,0.0806245510421979 | ||||||
| 
 | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312_core_ve_event_commits_mlm.rds
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312_core_ve_event_commits_mlm.rds
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										9
									
								
								commit_analysis/case1/0312_core_ve_testing_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								commit_analysis/case1/0312_core_ve_testing_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | |||||||
|  | "","effect","group","level","term","estimate","std.error","conf.low","conf.high" | ||||||
|  | "1","ran_vals","commit_type","other_commit_count","(Intercept)",0.0685032418929246,0.0181064985041167,0.0330151569387276,0.103991326847122 | ||||||
|  | "2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.0685032418929394,0.0191716559555347,-0.10607899708978,-0.0309274866960986 | ||||||
|  | "3","ran_vals","commit_type","other_commit_count","before_after",0.153698714908258,0.0406250196955685,0.0740751394337131,0.233322290382802 | ||||||
|  | "4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.153698714908291,0.0430148822329188,-0.238006334884044,-0.0693910949325384 | ||||||
|  | "5","ran_vals","commit_type","other_commit_count","relative_week",0.00806617338695213,0.00213201816920638,0.00388749456092261,0.0122448522129817 | ||||||
|  | "6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.00806617338695388,0.00225743916316379,-0.0124906728440451,-0.00364167392986262 | ||||||
|  | "7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.00542528453738445,0.00143398916095857,-0.00823585164708405,-0.00261471742768485 | ||||||
|  | "8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.00542528453738562,0.00151834695324874,0.00244937919298198,0.00840118988178927 | ||||||
| 
 | 
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										9
									
								
								commit_analysis/case1/0312_ve_ve_event_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								commit_analysis/case1/0312_ve_ve_event_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | |||||||
|  | "","effect","group","level","term","estimate","std.error","conf.low","conf.high" | ||||||
|  | "1","ran_vals","commit_type","other_commit_count","(Intercept)",0.19142744594005,0.141824355514081,-0.0865431829981536,0.469398074878253 | ||||||
|  | "2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",-0.191427445940039,0.144215326442934,-0.474084291786877,0.091229399906799 | ||||||
|  | "3","ran_vals","commit_type","other_commit_count","before_after",-0.0755920578722042,0.0560044816831836,-0.185358824944077,0.0341747091996687 | ||||||
|  | "4","ran_vals","commit_type","wikimedia_commit_count","before_after",0.0755920578721999,0.0569486431221849,-0.0360252316157072,0.187209347360107 | ||||||
|  | "5","ran_vals","commit_type","other_commit_count","relative_week",0.0275202739990639,0.0203891615662596,-0.0124417483457733,0.0674822963439011 | ||||||
|  | "6","ran_vals","commit_type","wikimedia_commit_count","relative_week",-0.0275202739990623,0.0207328958451281,-0.0681560031507335,0.0131154551526089 | ||||||
|  | "7","ran_vals","commit_type","other_commit_count","before_after:relative_week",-0.030560534526985,0.0226416232645461,-0.0749373006770195,0.0138162316230495 | ||||||
|  | "8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",0.0305605345269833,0.0230233310665026,-0.014564365167504,0.0756854342214706 | ||||||
| 
 | 
							
								
								
									
										
											BIN
										
									
								
								commit_analysis/case1/0312_ve_ve_event_commits_mlm.rds
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								commit_analysis/case1/0312_ve_ve_event_commits_mlm.rds
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										9
									
								
								commit_analysis/case1/0312_ve_ve_testing_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								commit_analysis/case1/0312_ve_ve_testing_ba_ranefs.csv
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | |||||||
|  | "","effect","group","level","term","estimate","std.error","conf.low","conf.high" | ||||||
|  | "1","ran_vals","commit_type","other_commit_count","(Intercept)",0,0,0,0 | ||||||
|  | "2","ran_vals","commit_type","wikimedia_commit_count","(Intercept)",0,0,0,0 | ||||||
|  | "3","ran_vals","commit_type","other_commit_count","before_after",0.0119022181828498,0.0145794598343909,-0.0166729980066047,0.0404774343723042 | ||||||
|  | "4","ran_vals","commit_type","wikimedia_commit_count","before_after",-0.0119022181828497,0.0146331623043827,-0.0405826892793688,0.0167782529136695 | ||||||
|  | "5","ran_vals","commit_type","other_commit_count","relative_week",-0.000674853711220998,0.000826652849473679,-0.00229506352390682,0.000945356101464824 | ||||||
|  | "6","ran_vals","commit_type","wikimedia_commit_count","relative_week",0.000674853711220992,0.000829697770228994,-0.000951324036481026,0.00230103145892301 | ||||||
|  | "7","ran_vals","commit_type","other_commit_count","before_after:relative_week",0.0134416522215151,0.0164651685602625,-0.0188294851559806,0.0457127895990109 | ||||||
|  | "8","ran_vals","commit_type","wikimedia_commit_count","before_after:relative_week",-0.013441652221515,0.0165258169128779,-0.0458316581858585,0.0189483537428285 | ||||||
| 
 | 
										
											Binary file not shown.
										
									
								
							| @ -2,8 +2,9 @@ | |||||||
| library(dplyr) | library(dplyr) | ||||||
| library(lubridate) | library(lubridate) | ||||||
| library(tidyr) | library(tidyr) | ||||||
|  | library(purrr) | ||||||
| 
 | 
 | ||||||
| ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/extensions_visualeditor_commits.csv" | ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/mediawiki_core_commits.csv" | ||||||
| 
 | 
 | ||||||
| transform_commit_data <- function(filepath){ | transform_commit_data <- function(filepath){ | ||||||
|   #basic, loading in the file  |   #basic, loading in the file  | ||||||
| @ -47,7 +48,14 @@ transform_commit_data <- function(filepath){ | |||||||
|   } |   } | ||||||
|    |    | ||||||
|   df <- df |> |   df <- df |> | ||||||
|     mutate(relative_week = relative_week(commit_date, event_date)) |     mutate(relative_week = relative_week(commit_date, event_date)) |> | ||||||
|  |     arrange(relative_week) |> | ||||||
|  |     group_by(author_email) |> | ||||||
|  |     mutate(first_commit_date = first(commit_date), | ||||||
|  |            new_author = as.numeric(difftime(commit_date, first_commit_date, units = "days") <= 7), | ||||||
|  |            new_author_wmf = if_else(grepl("@wikimedia", author_email), new_author, 0), | ||||||
|  |            new_author_unaff = if_else(!grepl("@wikimedia", author_email), new_author, 0)) |> | ||||||
|  |     ungroup() | ||||||
|    |    | ||||||
|   #filler for when there are weeks without commits |   #filler for when there are weeks without commits | ||||||
|   all_weeks <- seq(relative_week(start_date, event_date), relative_week(end_date, event_date)) |   all_weeks <- seq(relative_week(start_date, event_date), relative_week(end_date, event_date)) | ||||||
| @ -57,19 +65,18 @@ transform_commit_data <- function(filepath){ | |||||||
| 
 | 
 | ||||||
|    |    | ||||||
|   #for each week, get the list of unique authors that committed |   #for each week, get the list of unique authors that committed | ||||||
|   cumulative_authors <- df %>% |   #cumulative_authors <- df %>% | ||||||
|     arrange(relative_week) %>% |   #  arrange(relative_week) %>% | ||||||
|     group_by(relative_week) %>% |   #  group_by(relative_week) %>% | ||||||
|     summarize(cumulative_author_emails = list(unique(author_email)), .groups = 'drop') |   #  summarize(cumulative_author_emails = list(unique(author_email)), .groups = 'drop') | ||||||
|   #same for each committer |   #same for each committer | ||||||
|   cumulative_committers <- df %>% |   #cumulative_committers <- df %>% | ||||||
|     arrange(relative_week) %>% |   #  arrange(relative_week) %>% | ||||||
|     group_by(relative_week) %>% |   #  group_by(relative_week) %>% | ||||||
|     summarize(cumulative_committer_emails = list(unique(committer_email)), .groups = 'drop') |   #  summarize(cumulative_committer_emails = list(unique(committer_email)), .groups = 'drop') | ||||||
|    |    | ||||||
|   #now cut out the commit data that we don't care about  |   #now cut out the commit data that we don't care about  | ||||||
|   df <- df |> |   df <- df |> | ||||||
|     filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |> |  | ||||||
|     filter(author_email != "jenkins-bot@gerrit.wikimedia.org") |     filter(author_email != "jenkins-bot@gerrit.wikimedia.org") | ||||||
|    |    | ||||||
|   #in order: |   #in order: | ||||||
| @ -84,9 +91,11 @@ transform_commit_data <- function(filepath){ | |||||||
|               author_emails = list(unique(author_email)), |               author_emails = list(unique(author_email)), | ||||||
|               committer_emails = list(unique(committer_email)), |               committer_emails = list(unique(committer_email)), | ||||||
|               mediawiki_dev_commit_count = sum(grepl("@users.mediawiki.org", author_email)), |               mediawiki_dev_commit_count = sum(grepl("@users.mediawiki.org", author_email)), | ||||||
|               wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)), |               wikimedia_commit_count = sum(grepl("@wikimedia", author_email)), | ||||||
|               wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), |               wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), | ||||||
|               bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), |               bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), | ||||||
|  |               wmf_ft_commit_count = sum(new_author_wmf), | ||||||
|  |               unaff_ft_commit_count = sum(new_author_unaff), | ||||||
|               .groups = 'drop') |> |               .groups = 'drop') |> | ||||||
|     right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> |     right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> | ||||||
|     replace_na(list(commit_count = 0)) |> |     replace_na(list(commit_count = 0)) |> | ||||||
| @ -95,44 +104,47 @@ transform_commit_data <- function(filepath){ | |||||||
|     replace_na(list(jenkins_commit_count = 0)) |> |     replace_na(list(jenkins_commit_count = 0)) |> | ||||||
|     replace_na(list(mediawiki_dev_commit_count = 0)) |> |     replace_na(list(mediawiki_dev_commit_count = 0)) |> | ||||||
|     replace_na(list(wikia_commit_count = 0)) |> |     replace_na(list(wikia_commit_count = 0)) |> | ||||||
|     mutate(before_after = if_else(relative_week < 0, 0, 1))  |     replace_na(list(wmf_ft_commit_count = 0)) |> | ||||||
|  |     replace_na(list(unaff_ft_commit_count = 0)) |> | ||||||
|  |     mutate(before_after = if_else(relative_week < 0, 0, 1)) |> | ||||||
|  |     select(-author_emails, -committer_emails) | ||||||
|   # then, to get the authorship details in |   # then, to get the authorship details in | ||||||
|   # we check if the email data is present, if not we fill in blank  |   # we check if the email data is present, if not we fill in blank  | ||||||
|   # we bring in the information about authorship lists that we already had  |   # we bring in the information about authorship lists that we already had  | ||||||
|   # then comparing the current week's author list with the previous week's cumulative list, or empty |   # then comparing the current week's author list with the previous week's cumulative list, or empty | ||||||
|   #  ---- the length of that difference is the 'new' value |   #  ---- the length of that difference is the 'new' value | ||||||
|   # then we delete out the author list information |   # then we delete out the author list information | ||||||
|   weekly_with_authorship <- weekly_commits |> |   #weekly_with_authorship <- weekly_commits |> | ||||||
|     mutate( |   #  mutate( | ||||||
|       author_emails = ifelse(is.na(author_emails), list(character()), author_emails), |   #    author_emails = ifelse(is.na(author_emails), list(character()), author_emails), | ||||||
|       committer_emails = ifelse(is.na(committer_emails), list(character()), committer_emails) |   #    committer_emails = ifelse(is.na(committer_emails), list(character()), committer_emails) | ||||||
|     ) |> |   #  ) |> | ||||||
|     left_join(cumulative_authors, by = "relative_week") |> |   #  left_join(cumulative_authors, by = "relative_week") |> | ||||||
|     left_join(cumulative_committers, by = "relative_week") |> |   #  left_join(cumulative_committers, by = "relative_week") |> | ||||||
|     mutate(new_author_emails = mapply(function(x, y) length(setdiff(x, y)), author_emails, lag(cumulative_author_emails, default = list(character(1)))), |   #  mutate(new_author_emails = mapply(function(x, y) length(setdiff(x, y)), author_emails, lag(cumulative_author_emails, default = list(character(1)))), | ||||||
|            new_committer_emails = mapply(function(x, y) length(setdiff(x, y)), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))))  |   #         new_committer_emails = mapply(function(x, y) length(setdiff(x, y)), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))))  | ||||||
|    |    | ||||||
|   weekly_with_authorship <- weekly_with_authorship |> |   #weekly_with_authorship <- weekly_with_authorship |> | ||||||
|     mutate( |   #  mutate( | ||||||
|       wikimedia_author_emails = mapply(function(x) length(grep("@wikimedia.org", x)), author_emails), |   #    wikimedia_author_emails = mapply(function(x) length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), author_emails), | ||||||
|       non_wikimedia_author_emails = mapply(function(x) length(x) - length(grep("@wikimedia.org", x)), author_emails), |   #    non_wikimedia_author_emails = mapply(function(x) length(x) - length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), author_emails), | ||||||
|       wikimedia_committer_emails = mapply(function(x) length(grep("@wikimedia.org", x)), committer_emails), |   #    wikimedia_committer_emails = mapply(function(x) length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), committer_emails), | ||||||
|       non_wikimedia_committer_emails = mapply(function(x) length(x) - length(grep("@wikimedia.org", x)), committer_emails), |   #    non_wikimedia_committer_emails = mapply(function(x) length(x) - length(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x)), committer_emails), | ||||||
|       new_wikimedia_authors = mapply(function(x, y) length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))), |   #    new_wikimedia_authors = mapply(function(x, y) length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))), | ||||||
|       new_non_wikimedia_authors = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))), |   #    new_non_wikimedia_authors = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), author_emails, lag(cumulative_author_emails, default = list(character(1)))), | ||||||
|       new_wikimedia_committers = mapply(function(x, y) length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))), |   #    new_wikimedia_committers = mapply(function(x, y) length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))), | ||||||
|       new_non_wikimedia_committers = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia.org", x, value = TRUE), grep("@wikimedia.org", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))) |   #    new_non_wikimedia_committers = mapply(function(x, y) length(setdiff(x, y)) - length(setdiff(grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", x, value = TRUE), grep("@wikimedia|@users.mediawiki.org|@wikia-inc.com", y, value = TRUE))), committer_emails, lag(cumulative_committer_emails, default = list(character(1)))) | ||||||
|     ) |> |   #  ) |> | ||||||
|     select(-author_emails, -committer_emails, -cumulative_author_emails, -cumulative_committer_emails) |  | ||||||
|    |    | ||||||
|  |   weekly_commits <- weekly_commits |> | ||||||
|  |     filter(relative_week >= (-52) & relative_week <= 52 ) | ||||||
|    |    | ||||||
|   #gracefully exit |   #gracefully exit | ||||||
|   return(weekly_with_authorship) |   return(weekly_commits) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| test <- read.csv(ve_commit_fp, header = TRUE)  |  | ||||||
| transformed <- transform_commit_data(ve_commit_fp) | transformed <- transform_commit_data(ve_commit_fp) | ||||||
| output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" | output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_mediawiki_core_weekly_commit_count_data.csv" | ||||||
| 
 | 
 | ||||||
| write.csv(transformed, output_filepath, row.names = FALSE) | write.csv(transformed, output_filepath, row.names = FALSE) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,17 +1,18 @@ | |||||||
| library(tidyverse) | library(tidyverse) | ||||||
| entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_mediawiki_core_weekly_commit_count_data.csv" | library(dplyr) | ||||||
|  | entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0312_extensions_ve_weekly_commit_count_data.csv" | ||||||
| entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") | entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") | ||||||
| 
 | 
 | ||||||
| widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_mediawiki_core_weekly_commit_count_data.csv" | widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0312_extensions_ve_weekly_commit_count_data.csv" | ||||||
| widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") | widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") | ||||||
| 
 | 
 | ||||||
| event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_mediawiki_core_weekly_commit_count_data.csv" | event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_extensions_ve_weekly_commit_count_data.csv" | ||||||
| event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") | event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") | ||||||
| 
 | 
 | ||||||
| #input_df <- bind_rows(entest_df, widetest_df, event_df) | #input_df <- bind_rows(entest_df, widetest_df, event_df) | ||||||
| #dropping the event (2013-07-01) from the modeling | #dropping the event (2013-07-01) from the modeling | ||||||
| input_df <- bind_rows(entest_df, widetest_df) | #input_df <- bind_rows(entest_df, widetest_df) | ||||||
| #input_df <- event_df | input_df <- event_df | ||||||
| 
 | 
 | ||||||
| input_df <- input_df |> | input_df <- input_df |> | ||||||
|   mutate(nonbot_commit_count = commit_count - bot_commit_count)|> |   mutate(nonbot_commit_count = commit_count - bot_commit_count)|> | ||||||
| @ -27,7 +28,7 @@ library(dplyr) | |||||||
| 
 | 
 | ||||||
| #get into mlm format | #get into mlm format | ||||||
| long_df <- input_df |> | long_df <- input_df |> | ||||||
|   pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), |   tidyr::pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, unaff_ft_commit_count), | ||||||
|                names_to = "commit_type", |                names_to = "commit_type", | ||||||
|                values_to = "lengthened_commit_count") |                values_to = "lengthened_commit_count") | ||||||
| 
 | 
 | ||||||
| @ -38,24 +39,28 @@ intermediate_long_df <- long_df |> | |||||||
| 
 | 
 | ||||||
| library(rdd) | library(rdd) | ||||||
| 
 | 
 | ||||||
| var(intermediate_long_df$lengthened_commit_count) # 1253.343 | var(intermediate_long_df$lengthened_commit_count)  | ||||||
| mean(intermediate_long_df$lengthened_commit_count) # 44.92381 | mean(intermediate_long_df$lengthened_commit_count)  | ||||||
| median(intermediate_long_df$lengthened_commit_count) # 39.5 | median(intermediate_long_df$lengthened_commit_count)  | ||||||
| 
 | 
 | ||||||
| get_optimal_bandwidth <- function(df){ | get_optimal_bandwidth <- function(df){ | ||||||
|   bw <- tryCatch({ |   bw <- tryCatch({ | ||||||
|     IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular") |     IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular") | ||||||
|   }, error = function(e) { |   }, error = function(e) { | ||||||
|     NA |     NA | ||||||
|   }) |   }) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | intermediate_long_df <- intermediate_long_df |> | ||||||
|  |   filter(commit_type != "unaff_ft_commit_count") | ||||||
|  | 
 | ||||||
| optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df) | optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df) | ||||||
| 
 | 
 | ||||||
| 
 | library(dplyr) | ||||||
| window_num <- 4 | window_num <- 10 | ||||||
| final_long_df <- intermediate_long_df |> | final_long_df <- intermediate_long_df |> | ||||||
|   filter(relative_week >= (- window_num) & relative_week <= (window_num))  |   filter(relative_week >= (-window_num) & relative_week <= window_num) |> | ||||||
|  |   filter(commit_type != "unaff_ft_commit_count") | ||||||
| 
 | 
 | ||||||
| library(fitdistrplus) | library(fitdistrplus) | ||||||
| descdist(final_long_df$lengthened_commit_count, discrete=FALSE) | descdist(final_long_df$lengthened_commit_count, discrete=FALSE) | ||||||
| @ -70,14 +75,24 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + | |||||||
|                                      optCtrl=list(maxfun=2e5)), nAGQ=0, |                                      optCtrl=list(maxfun=2e5)), nAGQ=0, | ||||||
|                 data=final_long_df) |                 data=final_long_df) | ||||||
| #(before_after*relative_week|rd_event) | #(before_after*relative_week|rd_event) | ||||||
| #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+ | 
 | ||||||
| #                    (before_after*relative_week|commit_type) +  | saveRDS(mlm, file = "0312_ve_ve_event_commits_mlm.rds") | ||||||
| #                    (before_after*relative_week|rd_event) ,data=long_df) | #mlm <- readRDS("commit_analysis/case1/0312_core_ve_testing_commits_mlm.rds") | ||||||
| summary(mlm) | summary(mlm) | ||||||
| qqnorm(residuals(mlm)) | qqnorm(residuals(mlm)) | ||||||
| res <- ranef(mlm) | res <- ranef(mlm) | ||||||
| print(res) | print(res) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | library(broom.mixed) | ||||||
|  | library(ggplot2) | ||||||
|  | condvals <- broom.mixed::tidy(mlm, effects = "ran_vals", conf.int = TRUE) | ||||||
|  | glmer_ranef_ba <- condvals | ||||||
|  | write.csv(glmer_ranef_ba, "0312_ve_ve_event_ba_ranefs.csv") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| texreg(mlm) | texreg(mlm) | ||||||
| #final_long_df <- final_long_df |> | #final_long_df <- final_long_df |> | ||||||
| #  drop_na() | #  drop_na() | ||||||
| @ -116,34 +131,3 @@ summary(other_share_lmer) | |||||||
| qqnorm(residuals(other_share_lm)) | qqnorm(residuals(other_share_lm)) | ||||||
| 
 | 
 | ||||||
| texreg(other_share_lm) | texreg(other_share_lm) | ||||||
| 
 |  | ||||||
| #power analysis  |  | ||||||
| #library(simr) |  | ||||||
| #simrOptions(progress=FALSE) |  | ||||||
| 
 |  | ||||||
| ## Intercept and slopes for intervention, time1, time2, intervention:time1, intervention:time2 |  | ||||||
| #wmf_fixed <- c(0.511, -0.166, 0.002, 0.007) |  | ||||||
| ## Random intercepts for participants clustered by class |  | ||||||
| #wmf_rand <- matrix(c( |  | ||||||
| #  0.01,  0.005, 0.002, 0.001, |  | ||||||
| #  0.005, 0.02,  0.003, 0.004, |  | ||||||
| #  0.002, 0.003, 0.015, 0.006, |  | ||||||
| #  0.001, 0.004, 0.006, 0.01 |  | ||||||
| #), nrow=4, byrow=TRUE) |  | ||||||
| ## residual variance |  | ||||||
| #wmf_res <- 0.2065 |  | ||||||
| 
 |  | ||||||
| #wmf_model <- makeLmer(commit_share ~ before_after*relative_week + (before_after*relative_week | rd_event), |  | ||||||
| #                      fixef=wmf_fixed, VarCorr=wmf_rand, sigma=wmf_res, data=wikimedia_long_df) |  | ||||||
| 
 |  | ||||||
| #sim_treat <- powerSim(wmf_model, nsim=100, test = fcompare(commit_share~relative_week)) |  | ||||||
| #sim_treat |  | ||||||
| 
 |  | ||||||
| #model_ext_subj <- extend(wmf_model, within="rd_event+before_after+relative_week", n=30) |  | ||||||
| #sim_treat_subj <- powerSim(model_ext_subj, nsim=100, test = fcompare(commit_share~before_after*relative_week)) |  | ||||||
| #sim_treat_subj |  | ||||||
| 
 |  | ||||||
| #p_curve_treat <- powerCurve(model_ext_subj, test=fcompare(commit_share~before_after*relative_week),  |  | ||||||
| #                            within="rd_event+before_after+relative_week",  |  | ||||||
| #                            breaks=c(5,10,15,20)) |  | ||||||
| #plot(p_curve_treat) |  | ||||||
|  | |||||||
| @ -1,11 +1,11 @@ | |||||||
| library(tidyverse) | library(tidyverse) | ||||||
| entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv" | entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0312_mediawiki_core_weekly_commit_count_data.csv" | ||||||
| entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") | entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") | ||||||
| 
 | 
 | ||||||
| widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv" | widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0312_mediawiki_core_weekly_commit_count_data.csv" | ||||||
| widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") | widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") | ||||||
| 
 | 
 | ||||||
| event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" | event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0312_mediawiki_core_weekly_commit_count_data.csv" | ||||||
| event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") | event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") | ||||||
| 
 | 
 | ||||||
| #input_df <- bind_rows(entest_df, widetest_df, event_df) | #input_df <- bind_rows(entest_df, widetest_df, event_df) | ||||||
| @ -21,24 +21,29 @@ input_df <- input_df |> | |||||||
|   dplyr::select(-wikia_commit_count) |   dplyr::select(-wikia_commit_count) | ||||||
| 
 | 
 | ||||||
| #get into mlm format | #get into mlm format | ||||||
|  | #long_df <- input_df |> | ||||||
|  | #  tidyr::pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), | ||||||
|  | #               names_to = "commit_type", | ||||||
|  | #               values_to = "lengthened_commit_count") | ||||||
| long_df <- input_df |> | long_df <- input_df |> | ||||||
|   pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), |   tidyr::pivot_longer(cols = c(wmf_ft_commit_count, unaff_ft_commit_count, nonbot_commit_count), | ||||||
|                names_to = "commit_type", |                names_to = "commit_type", | ||||||
|                values_to = "lengthened_commit_count") |                values_to = "lengthened_commit_count") | ||||||
| 
 | 
 | ||||||
| intermediate_long_df <- long_df |> | intermediate_long_df <- long_df  | ||||||
|   mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> | #|> | ||||||
|   mutate(log_commits = log1p(lengthened_commit_count))|> | #  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> | ||||||
|   mutate(scaled_long_commits = lengthened_commit_count / 10)  | #  mutate(log_commits = log1p(lengthened_commit_count))|> | ||||||
|  | # mutate(scaled_long_commits = lengthened_commit_count / 10)  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| window_num <- 4 | window_num <- 8 | ||||||
| final_long_df <- intermediate_long_df |> | final_long_df <- intermediate_long_df |> | ||||||
|   filter(relative_week >= (- window_num) & relative_week <= (window_num))  |   filter(relative_week >= (- window_num) & relative_week <= (window_num))  | ||||||
| 
 | 
 | ||||||
| affiliationColors <- | affiliationColors <- | ||||||
|   setNames( c('#5da2d8', '#c7756a') |   setNames( c('#5da2d8', '#c7756a', 'black') | ||||||
|             ,c("other_commit_count", "wikimedia_commit_count")) |             ,c("unaff_ft_commit_count", "wmf_ft_commit_count", "nonbot_commit_count")) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| commit_plot <- final_long_df |> | commit_plot <- final_long_df |> | ||||||
| @ -52,11 +57,14 @@ commit_plot <- final_long_df |> | |||||||
|   scale_color_manual(values = affiliationColors, |   scale_color_manual(values = affiliationColors, | ||||||
|                      labels = c("other_commit_count" = "Unaffiliated", "wikimedia_commit_count" = "WMF Affiliated")) +  |                      labels = c("other_commit_count" = "Unaffiliated", "wikimedia_commit_count" = "WMF Affiliated")) +  | ||||||
|   scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +  |   scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +  | ||||||
|   ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") + |   ggtitle("VisualEditor New Contributors Commits Around Opt-In Testing Events (by Affiliation)") + | ||||||
|   theme_bw() +  |   theme_bw() +  | ||||||
|   theme(legend.position = "top") |   theme(legend.position = "top") | ||||||
| commit_plot | commit_plot | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | ggsave(filename = "0312-core-ve-testing-new-commits.png", plot = commit_plot, width = 15, height = 9, dpi = 800) | ||||||
|  | 
 | ||||||
| total_commit_plot <- final_long_df |> | total_commit_plot <- final_long_df |> | ||||||
|   ggplot(aes(x=relative_week,  |   ggplot(aes(x=relative_week,  | ||||||
|              y=nonbot_commit_count,  |              y=nonbot_commit_count,  | ||||||
| @ -74,8 +82,6 @@ total_commit_plot | |||||||
| ggsave(filename = "0305-ve-total-commits.png", plot = total_commit_plot, width = 15, height = 9, dpi = 800) | ggsave(filename = "0305-ve-total-commits.png", plot = total_commit_plot, width = 15, height = 9, dpi = 800) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| final_long_df <- final_long_df|> |  | ||||||
|   drop_na() |  | ||||||
| 
 | 
 | ||||||
| commit_share_plot <- final_long_df |> | commit_share_plot <- final_long_df |> | ||||||
|   ggplot(aes(x=relative_week,  |   ggplot(aes(x=relative_week,  | ||||||
| @ -93,4 +99,4 @@ commit_share_plot <- final_long_df |> | |||||||
|   theme(legend.position = "top") |   theme(legend.position = "top") | ||||||
| commit_share_plot | commit_share_plot | ||||||
| 
 | 
 | ||||||
| ggsave(filename = "0305-ve-testing-share.png", plot = commit_share_plot, width = 12, height = 9, dpi = 800) | ggsave(filename = "0312-ve-testing-share.png", plot = commit_share_plot, width = 12, height = 9, dpi = 800) | ||||||
|  | |||||||
							
								
								
									
										17
									
								
								mgaughan-rstudio-server_24802792.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								mgaughan-rstudio-server_24802792.out
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,17 @@ | |||||||
|  | 1. SSH tunnel from your workstation using the following command: | ||||||
|  | 
 | ||||||
|  |    ssh -N -L 8787:n3439:57439 mjilg@klone.hyak.uw.edu | ||||||
|  | 
 | ||||||
|  |    and point your web browser to http://localhost:8787 | ||||||
|  | 
 | ||||||
|  | 2. log in to RStudio Server using the following credentials: | ||||||
|  | 
 | ||||||
|  |    user: mjilg | ||||||
|  |    password: Q5pJ0QUFUjx7RBFTzpvm | ||||||
|  | 
 | ||||||
|  | When done using RStudio Server, terminate the job by: | ||||||
|  | 
 | ||||||
|  | 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) | ||||||
|  | 2. Issue the following command on the login node: | ||||||
|  | 
 | ||||||
|  |       scancel -f 24802792 | ||||||
| @ -0,0 +1,219 @@ | |||||||
|  | { | ||||||
|  |  "cells": [ | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 2, | ||||||
|  |    "id": "b270bd36-529e-4595-a780-ef6c8151c31f", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n", | ||||||
|  |       "  warnings.warn(\"Can't initialize NVML\")\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "import pandas as pd \n", | ||||||
|  |     "import spacy" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 3, | ||||||
|  |    "id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n", | ||||||
|  |     "phab_df = pd.read_csv(phab_path)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 4, | ||||||
|  |    "id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||||||
|  |       "  from .autonotebook import tqdm as notebook_tqdm\n", | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n", | ||||||
|  |       "  warnings.warn(warn_msg)\n" | ||||||
|  |      ] | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 4, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "nlp = spacy.load(\"en_core_web_trf\")\n", | ||||||
|  |     "nlp_coref = spacy.load(\"en_coreference_web_trf\")\n", | ||||||
|  |     "\n", | ||||||
|  |     "# use replace_listeners for the coref components\n", | ||||||
|  |     "nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n", | ||||||
|  |     "nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n", | ||||||
|  |     "\n", | ||||||
|  |     "# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n", | ||||||
|  |     "nlp.add_pipe(\"merge_entities\")\n", | ||||||
|  |     "nlp.add_pipe(\"coref\", source=nlp_coref)\n", | ||||||
|  |     "nlp.add_pipe(\"span_resolver\", source=nlp_coref)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 57, | ||||||
|  |    "id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# https://github.com/explosion/spaCy/discussions/13572\n", | ||||||
|  |     "# https://github.com/explosion/spaCy/issues/13111 \n", | ||||||
|  |     "# https://explosion.ai/blog/coref\n", | ||||||
|  |     "# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n", | ||||||
|  |     "doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 71, | ||||||
|  |    "id": "999e1656-0036-4ba2-bedf-f54493f67790", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n", | ||||||
|  |     "from spacy.tokens import Doc\n", | ||||||
|  |     "# Define lightweight function for resolving references in text\n", | ||||||
|  |     "def resolve_references(doc: Doc) -> str:\n", | ||||||
|  |     "    \"\"\"Function for resolving references with the coref output\n", | ||||||
|  |     "    doc (Doc): The Doc object processed by the coref pipeline\n", | ||||||
|  |     "    RETURNS (str): The Doc string with resolved references\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    # token.idx : token.text\n", | ||||||
|  |     "    token_mention_mapper = {}\n", | ||||||
|  |     "    output_string = \"\"\n", | ||||||
|  |     "    clusters = [\n", | ||||||
|  |     "        val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n", | ||||||
|  |     "    ]\n", | ||||||
|  |     "\n", | ||||||
|  |     "    # Iterate through every found cluster\n", | ||||||
|  |     "    for cluster in clusters:\n", | ||||||
|  |     "        first_mention = cluster[0]\n", | ||||||
|  |     "        # Iterate through every other span in the cluster\n", | ||||||
|  |     "        for mention_span in list(cluster)[1:]:\n", | ||||||
|  |     "            # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n", | ||||||
|  |     "            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n", | ||||||
|  |     "            \n", | ||||||
|  |     "            for token in mention_span[1:]:\n", | ||||||
|  |     "                # Set empty string for all the other tokens in mention_span\n", | ||||||
|  |     "                token_mention_mapper[token.idx] = \"\"\n", | ||||||
|  |     "\n", | ||||||
|  |     "    # Iterate through every token in the Doc\n", | ||||||
|  |     "    for token in doc:\n", | ||||||
|  |     "        # Check if token exists in token_mention_mapper\n", | ||||||
|  |     "        if token.idx in token_mention_mapper:\n", | ||||||
|  |     "            output_string += token_mention_mapper[token.idx]\n", | ||||||
|  |     "        # Else add original token text\n", | ||||||
|  |     "        else:\n", | ||||||
|  |     "            output_string += token.text + token.whitespace_\n", | ||||||
|  |     "\n", | ||||||
|  |     "    return output_string\n" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 72, | ||||||
|  |    "id": "be476647-624b-4e95-ab62-9c6b08f85368", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def resolving_comment(text):\n", | ||||||
|  |     "    doc = nlp(text)\n", | ||||||
|  |     "    resolved_text = resolve_references(doc)\n", | ||||||
|  |     "    return resolved_text" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 73, | ||||||
|  |    "id": "a9628b54-a1df-49cd-a365-9cba59de3421", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "'i hate ve.interface, ve.interface always messes up i browser'" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 73, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "resolving_comment(\"i hate ve.interface, it always messes up my browser\")" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "id": "46873641-8e88-4829-9e24-4dd5e6749bd1", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", | ||||||
|  |       "  with torch.cuda.amp.autocast(self._mixed_precision):\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "phab_df['text'] = phab_df['comment_text'].apply(str)\n", | ||||||
|  |     "phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [] | ||||||
|  |   } | ||||||
|  |  ], | ||||||
|  |  "metadata": { | ||||||
|  |   "kernelspec": { | ||||||
|  |    "display_name": "Python 3 (ipykernel)", | ||||||
|  |    "language": "python", | ||||||
|  |    "name": "python3" | ||||||
|  |   }, | ||||||
|  |   "language_info": { | ||||||
|  |    "codemirror_mode": { | ||||||
|  |     "name": "ipython", | ||||||
|  |     "version": 3 | ||||||
|  |    }, | ||||||
|  |    "file_extension": ".py", | ||||||
|  |    "mimetype": "text/x-python", | ||||||
|  |    "name": "python", | ||||||
|  |    "nbconvert_exporter": "python", | ||||||
|  |    "pygments_lexer": "ipython3", | ||||||
|  |    "version": "3.10.16" | ||||||
|  |   } | ||||||
|  |  }, | ||||||
|  |  "nbformat": 4, | ||||||
|  |  "nbformat_minor": 5 | ||||||
|  | } | ||||||
| @ -837,7 +837,7 @@ | |||||||
|    "name": "python", |    "name": "python", | ||||||
|    "nbconvert_exporter": "python", |    "nbconvert_exporter": "python", | ||||||
|    "pygments_lexer": "ipython3", |    "pygments_lexer": "ipython3", | ||||||
|    "version": "3.9.21" |    "version": "3.10.16" | ||||||
|   } |   } | ||||||
|  }, |  }, | ||||||
|  "nbformat": 4, |  "nbformat": 4, | ||||||
|  | |||||||
							
								
								
									
										219
									
								
								text_analysis/case1/coref_resolution.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										219
									
								
								text_analysis/case1/coref_resolution.ipynb
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,219 @@ | |||||||
|  | { | ||||||
|  |  "cells": [ | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 2, | ||||||
|  |    "id": "b270bd36-529e-4595-a780-ef6c8151c31f", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n", | ||||||
|  |       "  warnings.warn(\"Can't initialize NVML\")\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "import pandas as pd \n", | ||||||
|  |     "import spacy" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 3, | ||||||
|  |    "id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n", | ||||||
|  |     "phab_df = pd.read_csv(phab_path)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 4, | ||||||
|  |    "id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||||||
|  |       "  from .autonotebook import tqdm as notebook_tqdm\n", | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n", | ||||||
|  |       "  warnings.warn(warn_msg)\n" | ||||||
|  |      ] | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 4, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "nlp = spacy.load(\"en_core_web_trf\")\n", | ||||||
|  |     "nlp_coref = spacy.load(\"en_coreference_web_trf\")\n", | ||||||
|  |     "\n", | ||||||
|  |     "# use replace_listeners for the coref components\n", | ||||||
|  |     "nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n", | ||||||
|  |     "nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n", | ||||||
|  |     "\n", | ||||||
|  |     "# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n", | ||||||
|  |     "nlp.add_pipe(\"merge_entities\")\n", | ||||||
|  |     "nlp.add_pipe(\"coref\", source=nlp_coref)\n", | ||||||
|  |     "nlp.add_pipe(\"span_resolver\", source=nlp_coref)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 57, | ||||||
|  |    "id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# https://github.com/explosion/spaCy/discussions/13572\n", | ||||||
|  |     "# https://github.com/explosion/spaCy/issues/13111 \n", | ||||||
|  |     "# https://explosion.ai/blog/coref\n", | ||||||
|  |     "# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n", | ||||||
|  |     "doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 71, | ||||||
|  |    "id": "999e1656-0036-4ba2-bedf-f54493f67790", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n", | ||||||
|  |     "from spacy.tokens import Doc\n", | ||||||
|  |     "# Define lightweight function for resolving references in text\n", | ||||||
|  |     "def resolve_references(doc: Doc) -> str:\n", | ||||||
|  |     "    \"\"\"Function for resolving references with the coref output\n", | ||||||
|  |     "    doc (Doc): The Doc object processed by the coref pipeline\n", | ||||||
|  |     "    RETURNS (str): The Doc string with resolved references\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    # token.idx : token.text\n", | ||||||
|  |     "    token_mention_mapper = {}\n", | ||||||
|  |     "    output_string = \"\"\n", | ||||||
|  |     "    clusters = [\n", | ||||||
|  |     "        val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n", | ||||||
|  |     "    ]\n", | ||||||
|  |     "\n", | ||||||
|  |     "    # Iterate through every found cluster\n", | ||||||
|  |     "    for cluster in clusters:\n", | ||||||
|  |     "        first_mention = cluster[0]\n", | ||||||
|  |     "        # Iterate through every other span in the cluster\n", | ||||||
|  |     "        for mention_span in list(cluster)[1:]:\n", | ||||||
|  |     "            # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n", | ||||||
|  |     "            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n", | ||||||
|  |     "            \n", | ||||||
|  |     "            for token in mention_span[1:]:\n", | ||||||
|  |     "                # Set empty string for all the other tokens in mention_span\n", | ||||||
|  |     "                token_mention_mapper[token.idx] = \"\"\n", | ||||||
|  |     "\n", | ||||||
|  |     "    # Iterate through every token in the Doc\n", | ||||||
|  |     "    for token in doc:\n", | ||||||
|  |     "        # Check if token exists in token_mention_mapper\n", | ||||||
|  |     "        if token.idx in token_mention_mapper:\n", | ||||||
|  |     "            output_string += token_mention_mapper[token.idx]\n", | ||||||
|  |     "        # Else add original token text\n", | ||||||
|  |     "        else:\n", | ||||||
|  |     "            output_string += token.text + token.whitespace_\n", | ||||||
|  |     "\n", | ||||||
|  |     "    return output_string\n" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 72, | ||||||
|  |    "id": "be476647-624b-4e95-ab62-9c6b08f85368", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def resolving_comment(text):\n", | ||||||
|  |     "    doc = nlp(text)\n", | ||||||
|  |     "    resolved_text = resolve_references(doc)\n", | ||||||
|  |     "    return resolved_text" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 73, | ||||||
|  |    "id": "a9628b54-a1df-49cd-a365-9cba59de3421", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "'i hate ve.interface, ve.interface always messes up i browser'" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 73, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "resolving_comment(\"i hate ve.interface, it always messes up my browser\")" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "id": "46873641-8e88-4829-9e24-4dd5e6749bd1", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stderr", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", | ||||||
|  |       "  with torch.cuda.amp.autocast(self._mixed_precision):\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "phab_df['text'] = phab_df['comment_text'].apply(str)\n", | ||||||
|  |     "phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [] | ||||||
|  |   } | ||||||
|  |  ], | ||||||
|  |  "metadata": { | ||||||
|  |   "kernelspec": { | ||||||
|  |    "display_name": "Python 3 (ipykernel)", | ||||||
|  |    "language": "python", | ||||||
|  |    "name": "python3" | ||||||
|  |   }, | ||||||
|  |   "language_info": { | ||||||
|  |    "codemirror_mode": { | ||||||
|  |     "name": "ipython", | ||||||
|  |     "version": 3 | ||||||
|  |    }, | ||||||
|  |    "file_extension": ".py", | ||||||
|  |    "mimetype": "text/x-python", | ||||||
|  |    "name": "python", | ||||||
|  |    "nbconvert_exporter": "python", | ||||||
|  |    "pygments_lexer": "ipython3", | ||||||
|  |    "version": "3.10.16" | ||||||
|  |   } | ||||||
|  |  }, | ||||||
|  |  "nbformat": 4, | ||||||
|  |  "nbformat_minor": 5 | ||||||
|  | } | ||||||
| @ -1811,7 +1811,7 @@ | |||||||
|    "name": "python", |    "name": "python", | ||||||
|    "nbconvert_exporter": "python", |    "nbconvert_exporter": "python", | ||||||
|    "pygments_lexer": "ipython3", |    "pygments_lexer": "ipython3", | ||||||
|    "version": "3.9.21" |    "version": "3.10.16" | ||||||
|   } |   } | ||||||
|  }, |  }, | ||||||
|  "nbformat": 4, |  "nbformat": 4, | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user