# Source: mw-lifecycle-analysis/text_analysis/.ipynb_checkpoints/longitudinal_analysis-checkpoint.R
# Retrieved: 2025-04-04 13:51:35 -07:00 (49 lines, 1.6 KiB)
library(dplyr)
library(ggplot2)
# Load Phabricator comments for the VisualEditor (VE) case study and restrict
# them to the 2012-11-01 .. 2013-11-01 window (date_created is Unix epoch seconds).
phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"
phab_data <- read.csv(phab_data_path, header = TRUE)
phab_data <- phab_data |>
  # Flag comments that mention VisualEditor. "ve" must be word-bounded:
  # as a bare substring it also matches unrelated words ("have", "over",
  # "give", ...). ignore.case covers VE / Ve / VisualEditor variants.
  mutate(has_ref = grepl("visualeditor|\\bve\\b", comment_text, ignore.case = TRUE)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |>
  mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  mutate(comment_id = row_number()) |>
  # 1351728000 = 2012-11-01 00:00 UTC; 1383264000 = 2013-11-01 00:00 UTC.
  filter(date_created < 1383264000 & date_created > 1351728000)
#looking at all data between 11-1-2012 and 11-1-2013
# Quick sanity check: number of distinct creation timestamps in the window.
length(unique(phab_data$date_created))
#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) +
# geom_point(alpha = 0.5) +
# theme_minimal()
#g
library(udpipe)
#library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
library(tidytext)
library(dplyr)
library(stringr)
# we first need to transform our comment level of analysis into sentences
# Re-tokenize comments into sentences so the mention flags can be recomputed
# at sentence granularity (the comment-level flags are dropped and rebuilt).
sentence_level_data <- phab_data |>
  unnest_tokens(sentence, comment_text, token = "sentences") |>
  group_by(comment_id) |>
  mutate(sentence_id = row_number()) |>
  ungroup() |>
  dplyr::select(-has_bot_ref, -has_ref) |>
  # unnest_tokens lowercases sentences by default, so the former alternates
  # "VE" and "VisualEditor" could never match here; match case-insensitively
  # and word-bound "ve" so substrings of "have"/"over"/etc. are not flagged.
  mutate(has_ref = grepl("visualeditor|\\bve\\b", sentence, ignore.case = TRUE)) |>
  mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence))
library(udpipe)  # NOTE(review): duplicate of the earlier library(udpipe) call
library(rsyntax)
# Load necessary libraries
library(spacyr)
# NOTE(review): spacy_install() downloads and installs a Python spaCy
# environment; it is one-time setup and will re-run every time this script is
# sourced. Consider guarding it or running it interactively instead.
spacy_install()
# Keep only sentences that mention VE, then parse them with udpipe's
# pretrained English model (the model is downloaded on first use).
sentence_level_data <- sentence_level_data |>
  filter(has_ref) |>
  # NOTE(review): udpipe() returns a multi-row data.frame (one row per token)
  # per input, so storing its result via mutate() only works if it lines up
  # row-for-row with the filtered data — confirm this runs; otherwise call
  # udpipe() on the sentence vector separately and join back on doc_id.
  mutate(sentence_tokens = udpipe(sentence, "english"))