# Phabricator comment analysis: VisualEditor (VE) mentions, case 1.
library(dplyr)
library(ggplot2)

# Phabricator comment export for the VisualEditor (case 1) study.
phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"

phab_data <- read.csv(phab_data_path, header = TRUE)

# Flag comments mentioning VisualEditor and bots/scripts/gadgets, attach a
# POSIXct timestamp, assign a stable per-comment id, and keep only comments
# created between 2012-11-01 and 2013-11-01 (epoch 1351728000..1383264000).
phab_data <- phab_data |>
  # Word-bounded, case-insensitive match. The previous pattern
  # "visualeditor|VE|ve|VisualEditor" substring-matched the bare "ve"
  # inside words such as "have" or "over", inflating has_ref.
  mutate(has_ref = grepl("\\bvisualeditor|\\bve\\b", comment_text,
                         ignore.case = TRUE)) |>
  # Accept singular and plural forms; the previous pattern
  # "bots|scripts|gadgets" missed "bot", "script", "gadget".
  mutate(has_bot_ref = grepl("\\bbots?\\b|\\bscripts?\\b|\\bgadgets?\\b",
                             comment_text, ignore.case = TRUE)) |>
  # date_created is epoch seconds — presumably UTC; TODO confirm upstream.
  mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01",
                                tz = "UTC")) |>
  mutate(comment_id = row_number()) |>
  filter(date_created < 1383264000 & date_created > 1351728000)

# Looking at all data between 2012-11-01 and 2013-11-01:
# how many distinct creation timestamps remain after windowing.
length(unique(phab_data$date_created))
|
|
|
|
#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) +
|
|
# geom_point(alpha = 0.5) +
|
|
# theme_minimal()
|
|
#g
|
|
|
|
library(udpipe)
#library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file

library(tidytext)
library(dplyr)
library(stringr)

# Move from comment-level to sentence-level analysis: split every comment
# into sentences and re-derive the mention flags per sentence.
sentence_level_data <- phab_data |>
  # NOTE: unnest_tokens() lowercases its output by default, so `sentence`
  # below contains only lower-case text.
  unnest_tokens(sentence, comment_text, token = "sentences") |>
  group_by(comment_id) |>
  mutate(sentence_id = row_number()) |>
  # Drop the comment-level flags before recomputing them per sentence.
  dplyr::select(-has_bot_ref, -has_ref) |>
  # The previous pattern "visualeditor|VE|ve|VisualEditor" could never
  # match "VE" or "VisualEditor" here (the text is already lowercased),
  # and its bare "ve" alternative substring-matched words like "have";
  # use word boundaries instead.
  mutate(has_ref = grepl("\\bvisualeditor|\\bve\\b", sentence)) |>
  # Singular and plural forms; the previous pattern matched plurals only.
  mutate(has_bot_ref = grepl("\\bbots?\\b|\\bscripts?\\b|\\bgadgets?\\b",
                             sentence)) |>
  ungroup()
|
|
|
|
|
|
library(udpipe)
library(rsyntax)

# Load necessary libraries
library(spacyr)

# One-time setup: downloads/installs spaCy into its own Python environment.
# NOTE(review): this re-runs the installer step on every execution of the
# script — consider running it once interactively and commenting it out.
spacy_install()
|
|
# We only care about sentences that mention VE right now; tokenize those.
sentence_level_data <- sentence_level_data |>
  filter(has_ref)

# udpipe() returns one row per *token*, so it cannot be computed inside
# mutate(): the annotation's row count differs from the sentence count and
# dplyr raises a size error. Annotate all sentences in a single call
# (which also loads the English model only once), then split the token
# table back into a per-sentence list-column.
# NOTE(review): udpipe() labels unnamed character input doc1, doc2, ... in
# input order — confirm against the installed udpipe version's docs.
sentence_annotations <- udpipe(sentence_level_data$sentence, "english")
sentence_level_data$sentence_tokens <- unname(split(
  sentence_annotations,
  factor(sentence_annotations$doc_id,
         levels = paste0("doc", seq_len(nrow(sentence_level_data))))
))
|
|
|