# Load the categorized-citations CSV into a data frame.
data <- read.csv(
  file = "/home/mgaughan/git/adaptation-slr/cites/060225_olmo_categorized_citations.csv"
)
# Tabulate the raw values of column X1 (before any recoding).
table(data[["X1"]])

library(ggsankey)
library(ggplot2)
library(ggalluvial) # provides geom_alluvium(), geom_stratum(), stat "stratum"
library(dplyr)

# Collapse each stage column to a strict two-level "YES"/"NO" coding:
# an exact "YES" stays "YES"; every other value becomes "NO".
# `%in%` is used instead of `==` because it never returns NA, so missing
# cells are coded "NO" instead of propagating NA into the plotted values.
cols <- c("X1", "X2", "X3", "X4")
data[cols] <- lapply(
  data[cols],
  function(x) ifelse(x %in% "YES", "YES", "NO")
)
# Each row counts as one observation; `freq` is the weight the alluvial
# plot maps to its y aesthetic.
data$freq <- 1
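# Reference for the ggsankey Sankey diagram built further below
# (the plot immediately following this comment is an alluvial plot):
# https://r-charts.com/flow/sankey-diagram-ggplot2/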
# Alluvial plot of the YES/NO coding across the four stage columns X1..X4.
# geom_alluvium(), geom_stratum() and the "stratum" stat come from the
# ggalluvial package, which must be attached for this block to run.
alluvial_plot <- ggplot(
  data,
  aes(axis1 = X1, axis2 = X2, axis3 = X3, axis4 = X4, y = freq)
) +
  # Flows between adjacent axes, drawn in a neutral grey.
  geom_alluvium(fill = "grey70", width = 1 / 12, alpha = 0.7) +
  # One box per value on each axis, filled by that value.
  geom_stratum(
    aes(fill = after_stat(stratum)),
    width = 1 / 12,
    color = "black"
  ) +
  # Label every box with its value.
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 4) +
  scale_x_discrete(limits = c("X1", "X2", "X3", "X4"), expand = c(0.05, 0.05)) +
  scale_fill_manual(values = c("YES" = "#4CAF50", "NO" = "#F44336")) +
  labs(
    title = "Alluvial Plot: YES/NO at Each Stage",
    x = "Stage",
    y = "Count"
  ) +
  theme_minimal()

# Print explicitly: a bare object name is not auto-printed when the script
# is run through source(), so the plot would not be drawn in that case.
print(alluvial_plot)


library(ggsankey)

# Sankey diagram of the same YES/NO stage data, built with ggsankey.
#
# make_long() takes the wide table: every column passed to it becomes one
# stage on the x axis, and each row contributes one path through the stages.
# It produces the x / node / next_x / next_node columns used by the
# aesthetics below, so the data must not be pivoted to long form beforehand
# and no per-row id column is needed.
sankey_ready <- data %>%
  make_long(X1, X2, X3, X4)

sankey_plot <- ggplot(
  sankey_ready,
  aes(
    x = x,
    next_x = next_x,
    node = node,
    next_node = next_node,
    fill = factor(node),
    # geom_sankey_label() draws text and therefore needs a label aesthetic.
    label = node
  )
) +
  geom_sankey(flow.alpha = 0.6, node.color = "black") +
  geom_sankey_label(size = 3, color = "black") +
  # Same YES/NO palette as the alluvial plot so the two figures agree.
  scale_fill_manual(values = c("YES" = "#4CAF50", "NO" = "#F44336")) +
  theme_sankey(base_size = 12) +
  labs(title = "Sankey Diagram", fill = "Value")

# Print explicitly so the diagram is also drawn when the script is run
# through source(), where top-level expressions are not auto-printed.
print(sankey_plot)