# Analyze VisualEditor-related Phabricator comments (Nov 2012 - Nov 2013):
# load comment-level data, flag VE/bot mentions, explode comments into
# sentence-level rows, and tokenize the VE-relevant sentences with udpipe.

library(dplyr)
library(ggplot2)
library(tidytext)
library(stringr)
library(udpipe)
library(rsyntax)  # https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
library(spacyr)

phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"
phab_data <- read.csv(phab_data_path, header = TRUE)

# Unix-epoch bounds for the study window:
# 1351728000 = 2012-11-01 00:00 UTC, 1383264000 = 2013-11-01 00:00 UTC.
window_start <- 1351728000
window_end   <- 1383264000

phab_data <- phab_data |>
  # NOTE(review): the bare "ve"/"VE" alternatives match any substring
  # (e.g. "have", "over", "VErsion"), so has_ref is likely overbroad.
  # Consider word boundaries ("\\bve\\b|\\bVE\\b|visualeditor") if only
  # whole-word mentions are intended -- confirm before changing results.
  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |>
  # date_created is seconds since the Unix epoch.
  mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  # Stable per-comment id, assigned before sentence-level explosion.
  mutate(comment_id = row_number()) |>
  # Keep all data between 11-1-2012 and 11-1-2013.
  filter(date_created < window_end & date_created > window_start)

length(unique(phab_data$date_created))

# g <- ggplot(phab_data, aes(x = timestamp, y = has_bot_ref)) +
#   geom_point(alpha = 0.5) +
#   theme_minimal()
# g

# spacy_install() downloads and installs a full spaCy environment; it is a
# one-time setup step and should NOT run on every script execution.
# Run it manually once, then leave it commented out here.
# spacy_install()

# Transform the comment level of analysis into sentences; VE/bot flags are
# recomputed per sentence (the comment-level flags are dropped first).
sentence_level_data <- phab_data |>
  unnest_tokens(sentence, comment_text, token = "sentences") |>
  group_by(comment_id) |>
  mutate(sentence_id = row_number()) |>
  dplyr::select(-has_bot_ref, -has_ref) |>
  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |>
  ungroup()

# We only care about sentences that mention VE right now; tokenize those.
# NOTE(review): udpipe() returns a data.frame of token annotations, so
# assigning it inside mutate() produces a packed/nested column -- confirm
# that downstream code expects this shape rather than a separate
# token-level table joined by sentence_id.
sentence_level_data <- sentence_level_data |>
  filter(has_ref) |>
  mutate(sentence_tokens = udpipe(sentence, "english"))