# Analyze VisualEditor-related Phabricator comments (Nov 2012 - Nov 2013):
# load comment-level data, flag VE/bot mentions, explode comments into
# sentence-level rows, and tokenize the VE-relevant sentences with udpipe.

library(dplyr)
library(ggplot2)
library(tidytext)
library(stringr)
library(udpipe)
library(rsyntax)  # https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
library(spacyr)

phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"
phab_data <- read.csv(phab_data_path, header = TRUE)

# Unix-epoch bounds for the study window:
# 1351728000 = 2012-11-01 00:00 UTC, 1383264000 = 2013-11-01 00:00 UTC.
window_start <- 1351728000
window_end   <- 1383264000

phab_data <- phab_data |>
  # NOTE(review): the bare "ve"/"VE" alternatives match any substring
  # (e.g. "have", "over", "VErsion"), so has_ref is likely overbroad.
  # Consider word boundaries ("\\bve\\b|\\bVE\\b|visualeditor") if only
  # whole-word mentions are intended -- confirm before changing results.
  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |>
  # date_created is seconds since the Unix epoch.
  mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  # Stable per-comment id, assigned before sentence-level explosion.
  mutate(comment_id = row_number()) |>
  # Keep all data between 11-1-2012 and 11-1-2013.
  filter(date_created < window_end & date_created > window_start)

length(unique(phab_data$date_created))

# g <- ggplot(phab_data, aes(x = timestamp, y = has_bot_ref)) +
#   geom_point(alpha = 0.5) +
#   theme_minimal()
# g

# spacy_install() downloads and installs a full spaCy environment; it is a
# one-time setup step and should NOT run on every script execution.
# Run it manually once, then leave it commented out here.
# spacy_install()

# Transform the comment level of analysis into sentences; VE/bot flags are
# recomputed per sentence (the comment-level flags are dropped first).
sentence_level_data <- phab_data |>
  unnest_tokens(sentence, comment_text, token = "sentences") |>
  group_by(comment_id) |>
  mutate(sentence_id = row_number()) |>
  dplyr::select(-has_bot_ref, -has_ref) |>
  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |>
  ungroup()

# We only care about sentences that mention VE right now; tokenize those.
# NOTE(review): udpipe() returns a data.frame of token annotations, so
# assigning it inside mutate() produces a packed/nested column -- confirm
# that downstream code expects this shape rather than a separate
# token-level table joined by sentence_id.
sentence_level_data <- sentence_level_data |>
  filter(has_ref) |>
  mutate(sentence_tokens = udpipe(sentence, "english"))