99 lines
3.3 KiB
R
99 lines
3.3 KiB
R
library(tidyverse)
|
|
# ---- Inputs: Phabricator comment exports for case 2 and case 3 ----
# NOTE: despite the `_count` suffix, these two variables hold file paths.
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"

# Load each export into a data frame; the first line of each CSV is read as
# the column header.
c2_input_df <- read.csv(file = c2_count, header = TRUE)
c3_input_df <- read.csv(file = c3_count, header = TRUE)
|
|
|
|
#' Flag a piece of task text as relevant to the HTTP(S) sampling
#'
#' Splits `text` on whitespace, lower-cases every token and checks the tokens
#' against the expanded relevancy dictionary (http, login, SSL, TLS,
#' certificate):
#'
#' * a token containing "http", "login", "ssl" or "tls" anywhere, or
#' * a token that starts with "cert" but does not start with "certain".
#'
#' Tokens containing "://" are ignored, so a pasted URL does not make a text
#' relevant on its own. In this script the function is applied to task
#' titles.
#'
#' @param text A single value, normally a string. Non-character values (for
#'   example a length-one factor) are coerced with `as.character()`.
#' @return `TRUE` if at least one non-URL token matches the dictionary,
#'   otherwise `FALSE`. `NULL`, zero-length input and `NA` give `FALSE`.
http_relevant <- function(text) {
  # `if (is.na(text))` aborts with "argument is of length zero" for NULL or
  # character(0); treat "no text at all" as not relevant instead.
  if (length(text) == 0L) {
    return(FALSE)
  }
  if (is.na(text)) {
    return(FALSE)
  }

  # strsplit() rejects non-character input, so coerce first.
  tokens <- tolower(unlist(strsplit(as.character(text), "\\s+")))

  # Ignore URL tokens: only free-text mentions should count.
  tokens <- tokens[!grepl("://", tokens, fixed = TRUE)]

  matches_dictionary <-
    grepl("http", tokens, fixed = TRUE) |
    grepl("login", tokens, fixed = TRUE) |
    grepl("ssl", tokens, fixed = TRUE) |
    grepl("tls", tokens, fixed = TRUE) |
    # cert* (certificate, certs, ...) but not certain/certainly
    (startsWith(tokens, "cert") & !startsWith(tokens, "certain"))

  # any() of an empty vector is FALSE, which covers "" and URL-only text.
  any(matches_dictionary)
}
|
|
|
|
library(dplyr)
|
|
|
|
|
|
# ---- Case 1: keep every row of tasks created inside the study window ----
# NOTE: despite the `_count` suffix, this variable holds a file path.
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
c1_input_df <- read.csv(file = c1_count, header = TRUE)

# C1 window, both bounds inclusive:
#   1352592000 = 2012-11-11 00:00:00 UTC
#   1380671999 = 2013-10-01 23:59:59 UTC
# The flag is computed on task_description rows only.
# NOTE(review): assumes date_created holds Unix epoch seconds - confirm
# against the export.
c1_desc_flags <- c1_input_df |>
  filter(comment_type == "task_description") |>
  mutate(
    time_flag = date_created >= 1352592000 & date_created <= 1380671999
  ) |>
  select(TaskPHID, time_flag)

# Copy the per-task flag onto every row that shares the task's TaskPHID.
c1_flagged <- left_join(c1_input_df, c1_desc_flags, by = "TaskPHID")

# Keep rows whose task is inside the window. Rows whose TaskPHID has no
# task_description row get an NA flag from the join and are dropped here too.
c1_sampled <- filter(c1_flagged, time_flag)

# Sanity check: number of task descriptions that survived the sampling.
sum(c1_sampled$comment_type == "task_description")

write.csv(
  c1_sampled,
  "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv",
  row.names = FALSE
)
|
|
|
|
# ---- Case 2: HTTP-relevant tasks created inside the study window ----
# The window is checked against the date_created of the task itself, i.e. of
# its task_description row.
# C2 window, both bounds inclusive:
#   1315008000 = 2011-09-03 00:00:00 UTC
#   1385596799 = 2013-11-27 23:59:59 UTC
# NOTE(review): assumes date_created holds Unix epoch seconds - confirm
# against the export.
c2_desc_flags <- c2_input_df |>
  filter(comment_type == "task_description") |>
  mutate(
    # vapply() always yields one unnamed logical per row. sapply() returns
    # list() when there are no rows and attaches every title as a name.
    http_flag = vapply(
      task_title, http_relevant, logical(1),
      USE.NAMES = FALSE
    ),
    time_flag = date_created >= 1315008000 & date_created <= 1385596799
  ) |>
  select(TaskPHID, http_flag, time_flag)

# Disabled alternative: flag a task when either its description text or its
# title is relevant.
#mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%

# Copy the per-task flags onto every row that shares the task's TaskPHID.
c2_flagged <- c2_input_df |>
  left_join(c2_desc_flags, by = "TaskPHID")

# Keep rows whose task is both relevant and inside the window; rows with an
# NA flag (no task_description row for that TaskPHID) are dropped.
c2_sampled <- c2_flagged |>
  filter(http_flag, time_flag)

# Sanity check: number of task descriptions that survived the sampling.
sum(c2_sampled$comment_type == "task_description")

write.csv(
  c2_sampled,
  "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv",
  row.names = FALSE
)
|
|
# ---- Case 3: HTTP-relevant tasks created inside the study window ----
# C3 window, both bounds inclusive, checked on the task_description row:
#   1372636800 = 2013-07-01 00:00:00 UTC
#   1443743999 = 2015-10-01 23:59:59 UTC
# NOTE(review): assumes date_created holds Unix epoch seconds - confirm
# against the export.
c3_desc_flags <- c3_input_df |>
  filter(comment_type == "task_description") |>
  mutate(
    # vapply() always yields one unnamed logical per row. sapply() returns
    # list() when there are no rows and attaches every title as a name.
    http_flag = vapply(
      task_title, http_relevant, logical(1),
      USE.NAMES = FALSE
    ),
    time_flag = date_created >= 1372636800 & date_created <= 1443743999
  ) |>
  select(TaskPHID, http_flag, time_flag)

# Copy the per-task flags onto every row that shares the task's TaskPHID.
c3_flagged <- c3_input_df |>
  left_join(c3_desc_flags, by = "TaskPHID")

# Keep rows whose task is both relevant and inside the window; rows with an
# NA flag (no task_description row for that TaskPHID) are dropped.
c3_sampled <- c3_flagged |>
  filter(http_flag, time_flag)

# Sanity check: number of task descriptions that survived the sampling.
sum(c3_sampled$comment_type == "task_description")

write.csv(
  c3_sampled,
  "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv",
  row.names = FALSE
)
|