diff --git a/12825_revision/intermediary_script.py b/12825_revision/intermediary_script.py index b41b324..c5ba78e 100644 --- a/12825_revision/intermediary_script.py +++ b/12825_revision/intermediary_script.py @@ -8,10 +8,28 @@ import pandas as pd import datetime import argparse +''' +RUNNING INSTRUCTIONS: +[1] set up tmux environment +[2] edit this file where marked "FIX BELOW" +[3] install pip packages +[4] in your tmux environment, run the following three shell commands + - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no' + - export GIT_ASKPASS=false + - export GIT_TERMINAL_PROMPT=0 +[5] in tmux, run the script as follows with your START and STOP values + - python3 intermediary_script.py --start_index START --stop_index STOP +[6] the password handling is imperfect, so I would appreciate if you could check on the script every so often in case anything hangs + +THANK YOU VERY MUCH - matt +''' + +#FIX BELOW: temp_dir is where the repositories will be temporarily cloned to, if you are worried about space, specify here temp_dir = "/data/users/mgaughan/tmp3/" cst = datetime.timezone(datetime.timedelta(hours=-6)) from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst) to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst) +#FIX BELOW: this is where the commit data will be stored, the below parent directory needs to contain the subdirs contributing_commit_data and readme_commit_data within it COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/" def temp_clone(vcs_link, temp_location): @@ -126,9 +144,9 @@ def for_all_files(start_index, stop_index): lines = [line for line in file] for row in tqdm(csv.reader(lines), total=len(lines)): index += 1 - #time.sleep(5) if index < start_index: continue + time.sleep(4) if row[0] == "": empty_row += 1 continue @@ -165,7 +183,7 @@ def for_all_files(start_index, stop_index): print(f'inside cloning error: {e}') raise ValueError(e) os.chdir(temp_repo_path) - 
os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`") + os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' HEAD`") os.chdir(cwd) has_readme_bool, has_contributing_bool = False, False for filename in os.listdir(temp_repo_path): @@ -221,4 +239,6 @@ if __name__ == "__main__": args = parser.parse_args() for_all_files(args.start_index, args.stop_index) #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir) - #delete_clone(temp_dir) \ No newline at end of file + #delete_clone(temp_dir) + + #python3 intermediary_script.py --start_index START --stop_index STOP \ No newline at end of file diff --git a/R/012825_gam_introduction.png b/R/012825_gam_introduction.png new file mode 100644 index 0000000..05ad6bd Binary files /dev/null and b/R/012825_gam_introduction.png differ diff --git a/R/contribRDDAnalysis.R b/R/contribRDDAnalysis.R index 0420bf5..0a977a5 100644 --- a/R/contribRDDAnalysis.R +++ b/R/contribRDDAnalysis.R @@ -51,10 +51,8 @@ quantile(grouped_averages$x) quantile(all_actions_data$before_auth_new) quantile(all_actions_data$after_auth_new) -range(all_actions_data$log1p_count) # 0.000000 6.745236 -mean(all_actions_data$log1p_count) # 1.200043 -sd(all_actions_data$log1p_count) -median(all_actions_data$log1p_count) # 0.6931472 +mean(all_actions_data$count) # 8.440981 +var(all_actions_data$count) # 542.9546 # now for merge mrg_actions_data$logged_count <- log(mrg_actions_data$count) mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) diff --git a/R/gam_plot_documents.R b/R/gam_plot_documents.R index e7593eb..88d0b72 100644 --- a/R/gam_plot_documents.R +++ b/R/gam_plot_documents.R @@ -68,14 +68,17 @@ doctypeColors <- , c("CONTRIBUTING", "README")) time_plot <- all_actions_data |> - ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + - labs(x="Weekly Offset", y="Commit Count", color="Document Type") + + ggplot(aes(x=week_offset, y=count, 
color=factor(document_type))) + + scale_y_continuous(trans = 'log1p', labels = scales::comma) + + labs(x="Weekly Offset", y="Commit Count", color="Document Type: ") + scale_color_manual(values = doctypeColors) + geom_smooth() + geom_vline(xintercept = 0)+ theme_bw() + theme(legend.position = "top") time_plot + +ggsave(filename = "012825_gam_introduction.png", plot = time_plot, width = 8, height = 6, dpi = 500) #code to change the axes #scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), diff --git a/R/readmeRDDAnalysis.R b/R/readmeRDDAnalysis.R index 474752b..4d8bf7c 100644 --- a/R/readmeRDDAnalysis.R +++ b/R/readmeRDDAnalysis.R @@ -59,8 +59,8 @@ library(lme4) library(optimx) library(lattice) #some more EDA to go between Poisson and neg binomial -var(all_actions_data$log1p_count) # 1.125429 -mean (all_actions_data$log1p_count) # 0.6426873 +var(all_actions_data$count) # 268.4449 +mean (all_actions_data$count) # 3.757298 sd(all_actions_data$log1p_count) median(all_actions_data$log1p_count) #0 var(all_actions_data$count) # 268.4449