diff --git a/exploratory-analysis/syphilis_traffic.Rmd b/exploratory-analysis/syphilis_traffic.Rmd
new file mode 100644
index 0000000..ebbb0cc
--- /dev/null
+++ b/exploratory-analysis/syphilis_traffic.Rmd
@@ -0,0 +1,146 @@
+---
+title: "syphilis traffic"
+output: html_document
+date: "2024-11-19"
+---
+
+```{r}
+library(ggplot2)
+library(dplyr)
+
+# NOTE(review): `syphilisall` (and `syphilistotal`, used in the last chunk) is
+# never created in this notebook; presumably both must already exist in the
+# session before these chunks run -- TODO confirm or add a loading step.
+
+# Posts per year across all subreddits. Rows with a missing Year, and the year
+# 2021, are dropped before counting.
+posts_per_year <- syphilisall %>%
+  filter(!is.na(Year) & Year != 2021) %>%
+  group_by(Year) %>%
+  summarise(Number_of_Posts = n())
+
+# group = 1 joins the yearly points into one line even if Year is stored as
+# character/factor (each year would otherwise be its own group and no line
+# would be drawn); it has no effect when Year is numeric.
+ggplot(posts_per_year, aes(x = Year, y = Number_of_Posts)) +
+  geom_line(aes(group = 1)) +
+  geom_point() +
+  labs(
+    title = "Volume of Syphilis Traffic Over Time",
+    x = "Year",
+    y = "Number of Posts Across All of Reddit"
+  )
+```
+
+```{r}
+# Data-quality check: count rows whose Year is missing or blank.
+# na.rm = TRUE stops the blank-string count from collapsing to NA when Year
+# contains missing values.
+sum(is.na(syphilisall$Year))
+sum(syphilisall$Year == "", na.rm = TRUE)
+```
+
+```{r}
+# Posts per (Year, subreddit) pair, with the same Year filter as above.
+posts_per_year <- syphilisall %>%
+  filter(!is.na(Year) & Year != 2021) %>%
+  group_by(Year, subreddit) %>%
+  summarise(Number_of_Posts = n(), .groups = "drop")
+
+# One line per subreddit. The legend is hidden in this plot.
+ggplot(posts_per_year, aes(x = Year, y = Number_of_Posts, color = subreddit)) +
+  geom_line(aes(group = subreddit)) +
+  geom_point() +
+  theme(legend.position = "none") +
+  labs(
+    title = "Volume of Syphilis Traffic Over Time",
+    x = "Year",
+    y = "Number of Posts per Subreddit"
+  )
+```
+
+```{r}
+# Number of distinct subreddit values in the data.
+unique_values <- unique(syphilisall$subreddit)
+length(unique_values)
+```
+
+```{r}
+# Ten subreddits with the most posts (ties at the cut-off are kept, as with the
+# superseded top_n()). Blank subreddit names are not excluded in this chunk.
+top_subreddits <- syphilisall %>%
+  filter(!is.na(Year) & Year != 2021) %>%
+  group_by(subreddit) %>%
+  summarise(total_posts = n(), .groups = "drop") %>%
+  slice_max(total_posts, n = 10)
+
+# Yearly post counts restricted to those subreddits.
+posts_per_year <- syphilisall %>%
+  filter(subreddit %in% top_subreddits$subreddit, !is.na(Year) & Year != 2021) %>%
+  group_by(Year, subreddit) %>%
+  summarise(Number_of_Posts = n(), .groups = "drop")
+
+ggplot(posts_per_year, aes(x = Year, y = Number_of_Posts, color = subreddit)) +
+  geom_line(aes(group = subreddit)) +
+  geom_point() +
+  labs(
+    title = "Volume of Syphilis Traffic Over Time",
+    x = "Year",
+    y = "Number of Posts per Subreddit"
+  )
+```
+
+```{r}
+# Data-quality check: count rows whose subreddit is missing or blank.
+# na.rm = TRUE keeps the blank-string count defined when some values are NA.
+sum(is.na(syphilisall$subreddit))
+sum(syphilisall$subreddit == "", na.rm = TRUE)
+```
+
+```{r}
+# Same top-ten analysis, now leaving blank subreddit names out of the ranking.
+top_subreddits <- syphilisall %>%
+  filter(!is.na(Year) & Year != 2021, subreddit != "") %>%
+  group_by(subreddit) %>%
+  summarise(total_posts = n(), .groups = "drop") %>%
+  slice_max(total_posts, n = 10)
+
+posts_per_year <- syphilisall %>%
+  filter(subreddit %in% top_subreddits$subreddit, !is.na(Year) & Year != 2021) %>%
+  group_by(Year, subreddit) %>%
+  summarise(Number_of_Posts = n(), .groups = "drop")
+
+ggplot(posts_per_year, aes(x = Year, y = Number_of_Posts, color = subreddit)) +
+  geom_line(aes(group = subreddit)) +
+  geom_point() +
+  labs(
+    title = "Volume of Syphilis Traffic Over Time in Top Ten Subreddits",
+    x = "Year",
+    y = "Number of Posts per Subreddit"
+  )
+```
+
+```{r}
+# Top-ten analysis repeated on `syphilistotal`.
+# NOTE(review): unlike the previous chunk, blank subreddit names are not
+# filtered out here -- confirm whether that is intentional.
+top_subreddits <- syphilistotal %>%
+  filter(!is.na(Year) & Year != 2021) %>%
+  group_by(subreddit) %>%
+  summarise(total_posts = n(), .groups = "drop") %>%
+  slice_max(total_posts, n = 10)
+
+posts_per_year <- syphilistotal %>%
+  filter(subreddit %in% top_subreddits$subreddit, !is.na(Year) & Year != 2021) %>%
+  group_by(Year, subreddit) %>%
+  summarise(Number_of_Posts = n(), .groups = "drop")
+
+ggplot(posts_per_year, aes(x = Year, y = Number_of_Posts, color = subreddit)) +
+  geom_line(aes(group = subreddit)) +
+  geom_point() +
+  labs(
+    title = "Volume of Syphilis Traffic Over Time",
+    x = "Year",
+    y = "Number of Posts per Subreddit"
+  )
+```