diff --git a/.sh_history b/.sh_history index ec57dbd..f09a142 100644 --- a/.sh_history +++ b/.sh_history @@ -13,3 +13,7 @@ ls rm event_0215_ve_weekly_commit_count_data.csv rm announcement_0215_ve_weekly_commit_count_data.csv ls +cd ~ +ls +ls .local +rm -r -f .local diff --git a/.wget-hsts b/.wget-hsts new file mode 100644 index 0000000..e83e13d --- /dev/null +++ b/.wget-hsts @@ -0,0 +1,4 @@ +# HSTS 1.0 Known Hosts database for GNU Wget. +# Edit at your own risk. +# +www.python.org 0 1 1740094792 63072000 diff --git a/bot_activity_analysis/bot_activity_exploration.R b/bot_activity_analysis/bot_activity_exploration.R index aff7b3b..6bc4e43 100644 --- a/bot_activity_analysis/bot_activity_exploration.R +++ b/bot_activity_analysis/bot_activity_exploration.R @@ -60,7 +60,7 @@ count_var_to_mean <- count_var / count_mean #2262140.471 # the mean count values for each day wiki_summary_df <- combined_df |> - filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |> + filter(date >= as.Date("2012-12-01") & date <= as.Date("2013-09-30")) |> filter(wiki_db == "enwiki") |> group_by(date) |> summarize( @@ -70,8 +70,10 @@ wiki_summary_df <- combined_df |> #plotting it p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) + geom_line(color = "blue") + # Line plot # Points on the line - geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + - labs(title = "enwiki Total Bot Actions", + geom_vline(xintercept = as.Date("2013-07-02"), linetype = "dashed", color = "black") + + geom_vline(xintercept = as.Date("2013-04-28"), linetype = "dashed", color = "black") + + geom_vline(xintercept = as.Date("2012-12-12"), linetype = "dashed", color = "black") + + labs(title = "enwiki Bot Actions 2012-12 to 2013-10", x = "Date (daily)", y = "Action Count") + theme_minimal() diff --git a/commit_analysis/commit_count_collation.R b/commit_analysis/commit_count_collation.R index 177d226..a9f55bf 100644 --- a/commit_analysis/commit_count_collation.R +++ b/commit_analysis/commit_count_collation.R @@ -3,7 +3,7 @@ library(dplyr) library(lubridate) library(tidyr) -ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/mediawiki_core_commits.csv" +ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/extensions_visualeditor_commits.csv" transform_commit_data <- function(filepath){ #basic, loading in the file @@ -14,8 +14,9 @@ transform_commit_data <- function(filepath){ # TODO: this is project/event specific - #event_date <- as.Date("2013-07-01") - event_date <- as.Date("2013-06-06") + event_date <- as.Date("2013-07-01") + #event_date <- as.Date("2013-04-28") + #event_date <- as.Date("2012-12-12") # isolate project id project_id <- sub("_.*$", "", file_name) @@ -68,7 +69,8 @@ transform_commit_data <- function(filepath){ #now cut out the commit data that we don't care about df <- df |> - filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) + filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |> + filter(author_email != "jenkins-bot@gerrit.wikimedia.org") #in order: # - we group by project, week, ages @@ -85,7 +87,6 @@ transform_commit_data <- function(filepath){ wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)), wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), - jenkins_commit_count = sum(grepl("jenkins-bot@gerrit.wikimedia.org|gerrit@wikimedia.org", author_email)), .groups = 'drop') |> right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> replace_na(list(commit_count = 0)) |> @@ -131,7 +132,7 @@ transform_commit_data <- function(filepath){ test <- read.csv(ve_commit_fp, header = TRUE) transformed <- transform_commit_data(ve_commit_fp) -output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/announcement_0215_core_weekly_commit_count_data.csv" +output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" write.csv(transformed, output_filepath, row.names = FALSE) diff --git a/commit_analysis/commit_plotting.R b/commit_analysis/commit_plotting.R index 2fac642..76fb1d6 100644 --- a/commit_analysis/commit_plotting.R +++ b/commit_analysis/commit_plotting.R @@ -1,19 +1,24 @@ library(tidyverse) -count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_ve_weekly_commit_count_data.csv" +count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" input_df <- read.csv(count_data_fp, header = TRUE) input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count -window_num <- 19 +window_num <- 52 input_df <- input_df |> - filter(relative_week >= (- window_num) & relative_week <= (window_num)) + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> + mutate(nonbot_commit_count = commit_count - bot_commit_count)|> + mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> + mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> + select(-mediawiki_dev_commit_count) |> + select(-wikia_commit_count) library(scales) library(ggplot2) time_plot <- input_df |> - ggplot(aes(x=relative_week, y=wikimedia_commit_count)) + - labs(x="Weekly Offset", y="Wikimedia Commit Count") + + ggplot(aes(x=relative_week, y=nonbot_commit_count)) + + labs(x="Weekly Offset", y="Nonbot Commit Count") + geom_smooth() + geom_vline(xintercept = 0)+ theme_bw() + @@ -24,22 +29,24 @@ library(dplyr) share_df <- input_df |> mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> - mutate(wikia_share = wikia_commit_count / nonbot_commit_count) |> - mutate(gerrit_share = jenkins_commit_count / nonbot_commit_count) |> - mutate(mw_dev_share = mediawiki_dev_commit_count / nonbot_commit_count) |> - mutate(other_share = (nonbot_commit_count - jenkins_commit_count - wikia_commit_count - wikimedia_commit_count - mediawiki_dev_commit_count) / nonbot_commit_count)|> + mutate(other_share = other_commit_count / nonbot_commit_count)|> drop_na() share_long <- share_df |> - dplyr::select(relative_week, wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share) |> - pivot_longer(cols = c(wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share), names_to = "category", values_to = "share") + dplyr::select(relative_week, wikimedia_share, other_share) |> + pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share") share_plot <- share_long |> ggplot(aes(x=relative_week, y=share, color=category)) + - geom_smooth() + + geom_line() + geom_vline(xintercept = 0)+ + annotate("text", x = -7, y=1, label = "2012-12-12") + + geom_vline(xintercept = 19)+ + annotate("text", x = 12, y=1, label = "2013-04-28") + + geom_vline(xintercept = 28)+ + annotate("text", x = 35, y=1, label = "2013-07-01") + labs(x = "Relative Week", y = "Share of Nonbot Commit Count", color = "Affiliation") + - ggtitle("Weekly Share of Nonbot Commit Count by Affiliation") + + ggtitle("VE Weekly Share of Nonbot Commit Count by Affiliation (enwiki opt-in testing 2012-12-12)") + theme_bw() + theme(legend.position = "top") share_plot diff --git a/commit_analysis/matched_rdd_models.R b/commit_analysis/matched_rdd_models.R new file mode 100644 index 0000000..9f11398 --- /dev/null +++ b/commit_analysis/matched_rdd_models.R @@ -0,0 +1,132 @@ +library(tidyverse) +entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv" +entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") + +widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv" +widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") + +event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" +event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") + +input_df <- bind_rows(entest_df, widetest_df, event_df) +#input_df <- bind_rows(entest_df, widetest_df) + +input_df <- input_df |> + mutate(nonbot_commit_count = commit_count - bot_commit_count)|> + mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> + mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> + dplyr::select(-mediawiki_dev_commit_count) |> + dplyr::select(-wikia_commit_count) + +library(MASS) + +library(lme4) +library(dplyr) + +#get into mlm format +long_df <- input_df |> + pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), + names_to = "commit_type", + values_to = "lengthened_commit_count") + +intermediate_long_df <- long_df |> + mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> + mutate(log_commits = log1p(lengthened_commit_count))|> + mutate(scaled_long_commits = lengthened_commit_count / 10) + +library(rdd) + +intermediate_long_df <- intermediate_long_df |> + drop_na() + +var(intermediate_long_df$commit_share) # 1253.343 +mean(intermediate_long_df$commit_share) # 44.92381 +median(intermediate_long_df$commit_share) # 39.5 + +get_optimal_bandwidth <- function(df){ + bw <- tryCatch({ + IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular") + }, error = function(e) { + NA + }) +} + +optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df) + + +window_num <- 18 +final_long_df <- intermediate_long_df |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) + +library(fitdistrplus) +descdist(long_df$lengthened_commit_count, discrete=FALSE) +#start_values <- list(shape1 = 1, shape2 = 1) +#fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial") +print(fit) + +#NOTE should not run if you've already dropped NA +mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + + (before_after*relative_week|commit_type) + + (before_after*relative_week|rd_event), + control=glmerControl(optimizer="bobyqa", + optCtrl=list(maxfun=2e5)), nAGQ=0, + data=long_df) +#mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+ +# (before_after*relative_week|commit_type) + +# (before_after*relative_week|rd_event) ,data=long_df) +summary(mlm) +qqnorm(residuals(mlm)) +res <- ranef(mlm) +print(res) + +#final_long_df <- final_long_df |> +# drop_na() +library(performance) +#descdist(long_df$commit_share, discrete=FALSE) +#fit <- MASS::fitdistr(as.numeric(long_df$commit_share), "normal") +#print(fit) +wikimedia_long_df <- final_long_df |> + filter(commit_type == "wikimedia_commit_count") +wikimedia_share_lmer <- lmer(commit_share ~ before_after*relative_week + + (1| rd_event), + data=wikimedia_long_df) +summary(wikimedia_share_lmer) +icc(wikimedia_share_lmer) + +other_long_df <- final_long_df |> + filter(commit_type == "other_commit_count") +other_share_lmer <- lm(commit_share ~ before_after*relative_week, + data=other_long_df) +summary(other_share_lmer) +icc(other_share_lmer) + +#power analysis +#library(simr) +#simrOptions(progress=FALSE) + +## Intercept and slopes for intervention, time1, time2, intervention:time1, intervention:time2 +#wmf_fixed <- c(0.511, -0.166, 0.002, 0.007) +## Random intercepts for participants clustered by class +#wmf_rand <- matrix(c( +# 0.01, 0.005, 0.002, 0.001, +# 0.005, 0.02, 0.003, 0.004, +# 0.002, 0.003, 0.015, 0.006, +# 0.001, 0.004, 0.006, 0.01 +#), nrow=4, byrow=TRUE) +## residual variance +#wmf_res <- 0.2065 + +#wmf_model <- makeLmer(commit_share ~ before_after*relative_week + (before_after*relative_week | rd_event), +# fixef=wmf_fixed, VarCorr=wmf_rand, sigma=wmf_res, data=wikimedia_long_df) + +#sim_treat <- powerSim(wmf_model, nsim=100, test = fcompare(commit_share~relative_week)) +#sim_treat + +#model_ext_subj <- extend(wmf_model, within="rd_event+before_after+relative_week", n=30) +#sim_treat_subj <- powerSim(model_ext_subj, nsim=100, test = fcompare(commit_share~before_after*relative_week)) +#sim_treat_subj + +#p_curve_treat <- powerCurve(model_ext_subj, test=fcompare(commit_share~before_after*relative_week), +# within="rd_event+before_after+relative_week", +# breaks=c(5,10,15,20)) +#plot(p_curve_treat) diff --git a/commit_analysis/models.R b/commit_analysis/models.R index 932abcb..bba5e29 100644 --- a/commit_analysis/models.R +++ b/commit_analysis/models.R @@ -1,6 +1,6 @@ library(tidyverse) -count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_core_weekly_commit_count_data.csv" -input_df <- read.csv(count_data_fp, header = TRUE) +count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv" +input_df <- read.csv(count_data_fp, header = TRUE) library(rdd) @@ -18,34 +18,53 @@ get_optimal_bandwidth <- function(df){ optimal_bandwidth <- get_optimal_bandwidth(input_df) -window_num <- 13 +window_num <- 8 input_df <- input_df |> filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> - mutate(other_commit_count = commit_count - bot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count - jenkins_commit_count) + mutate(nonbot_commit_count = commit_count - bot_commit_count)|> + mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> + mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> + select(-mediawiki_dev_commit_count) |> + select(-wikia_commit_count) -library(MASS) +#library(MASS) -simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df) -summary(simple_model) +#simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df) +#summary(simple_model) library(lme4) library(dplyr) #get into mlm format long_df <- input_df |> - pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, jenkins_commit_count, wikia_commit_count, mediawiki_dev_commit_count), + pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), names_to = "commit_type", values_to = "lengthened_commit_count") long_df <- long_df |> - mutate(commit_share = lengthened_commit_count / (commit_count - bot_commit_count)) |> + mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> mutate(log_commits = log1p(lengthened_commit_count)) -mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + (before_after*relative_week|commit_type), +mlm <- glmer.nb(log_commits ~ before_after*relative_week + (before_after*relative_week|commit_type), control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)), nAGQ=0, data=long_df) summary(mlm) ranefs <- ranef(mlm) print(ranefs) -saveRDS(mlm, "021525_core-ve_event_mlm.rda") +#saveRDS(mlm, "021525_core-ve_event_mlm.rda") + +share_df <- input_df |> + mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> + mutate(other_share = other_commit_count / nonbot_commit_count)|> + drop_na() + +share_long <- share_df |> + dplyr::select(relative_week, wikimedia_share, other_share, before_after) |> + pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share") + +share_mlm <- glmer.nb(share ~ before_after*relative_week + (before_after*relative_week|category), + control=glmerControl(optimizer="bobyqa", + optCtrl=list(maxfun=2e5)), nAGQ=0, + data=share_long) +summary(share_mlm) diff --git a/english-ewt-ud-2.5-191206.udpipe b/english-ewt-ud-2.5-191206.udpipe new file mode 100644 index 0000000..7f16e14 Binary files /dev/null and b/english-ewt-ud-2.5-191206.udpipe differ diff --git a/gerrit_analysis/plotting_gerrit.R b/gerrit_analysis/plotting_gerrit.R index 7322f5e..5ddad2f 100644 --- a/gerrit_analysis/plotting_gerrit.R +++ b/gerrit_analysis/plotting_gerrit.R @@ -4,11 +4,10 @@ input_df <- read.csv(count_data_fp, header = TRUE) #window_num <- 19 -window_num <- 52 +window_num <- 26 input_df <- input_df |> filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> - mutate(parent_projects = if_else(project == "mediawiki/extensions/VisualEditor", - "mediawiki/extensions", project)) + mutate(parent_projects = project) library(scales) library(ggplot2) @@ -17,6 +16,7 @@ time_plot <- input_df |> ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + labs(x="Weekly Offset", y="New Gerrit Tasks Created", color = "Project") + geom_smooth() + + geom_point() + geom_vline(xintercept = 0)+ theme_bw() + theme(legend.position = "top") @@ -28,7 +28,8 @@ abandoned_df <- input_df |> time_plot <- abandoned_df |> ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + labs(x="Weekly Offset", y="AbandonedGerrit Tasks Created", color = "Project") + - geom_line() + + geom_smooth() + + geom_point() + geom_vline(xintercept = 0)+ theme_bw() + theme(legend.position = "top") @@ -37,7 +38,8 @@ time_plot delta_df <- input_df |> filter(task_count != 0) |> - filter(relative_week >= (- 12)) + filter(relative_week >= (- 12)) |> + filter(status != "ABANDONED") time_plot <- delta_df |> ggplot(aes(x=relative_week, y=avg_resolution_time, color=parent_projects)) + labs(x="Weekly Offset", y="Avg. (weekly) Time from task creation to last update (days)", color = "Project") + @@ -49,11 +51,12 @@ time_plot loc_df <- input_df |> + filter(relative_week >= (- 19) & relative_week <= (19)) |> filter(task_count != 0) |> filter(status != "ABANDONED") time_plot <- loc_df |> - ggplot(aes(x=relative_week, y=avg_deletions, color=parent_projects)) + - labs(x="Weekly Offset", y="Avg. LOC Deleted per Accepted Gerrit Task", color = "Project") + + ggplot(aes(x=relative_week, y=avg_insertions, color=parent_projects)) + + labs(x="Weekly Offset", y="Avg. LOC Inserted per Accepted Gerrit Task", color = "Project") + geom_line() + geom_vline(xintercept = 0)+ theme_bw() + diff --git a/mgaughan-rstudio-server_24372601.out b/mgaughan-rstudio-server_24372601.out new file mode 100644 index 0000000..c9a9b75 --- /dev/null +++ b/mgaughan-rstudio-server_24372601.out @@ -0,0 +1,18 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3441:34585 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: WoborOUQ79MgRq898+pw + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 24372601 +slurmstepd: error: *** JOB 24372601 ON n3441 CANCELLED AT 2025-02-20T15:43:50 *** diff --git a/text_analysis/longitudinal_analysis.R b/text_analysis/longitudinal_analysis.R index 7dd944d..dd64143 100644 --- a/text_analysis/longitudinal_analysis.R +++ b/text_analysis/longitudinal_analysis.R @@ -1,15 +1,48 @@ library(dplyr) library(ggplot2) -phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/0205_convo_data/phab_data/visualeditor/0205_ve_phab_comments.csv" +phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv" phab_data <- read.csv(phab_data_path, header=TRUE) phab_data <- phab_data |> - mutate(has_ref = grepl("bots", comment_text)) |> + mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |> + mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |> mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |> - filter(date_created < 1518232866 & date_created > 1356923678) + mutate(comment_id = row_number())|> + filter(date_created < 1383264000 & date_created > 1351728000) +#looking at all data between 11-1-2012 and 11-1-2013 + +length(unique(phab_data$date_created)) + +#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) + +# geom_point(alpha = 0.5) + +# theme_minimal() +#g + +library(udpipe) +#library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file + +library(tidytext) +library(dplyr) +library(stringr) + +# we first need to transform our comment level of analysis into sentences +sentence_level_data <- phab_data |> + unnest_tokens(sentence, comment_text, token = "sentences") |> + group_by(comment_id) |> + mutate(sentence_id = row_number())|> + dplyr::select(-has_bot_ref, -has_ref)|> + mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |> + mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |> + ungroup() -g <- ggplot(phab_data, aes(x=timestamp, y=has_ref)) + - geom_point(alpha = 0.5) + - theme_minimal() -g +library(udpipe) +library(rsyntax) +# Load necessary libraries +library(spacyr) +spacy_install() +#we only care about stuff that mentions VE rn, then tokenize +sentence_level_data <- sentence_level_data |> + filter(has_ref == TRUE) |> + mutate(sentence_tokens = udpipe(sentence, "english")) +