updating scripts with camera ready information

2025-01-29 10:31:55 -06:00 · 2025-01-29 10:31:55 -06:00 · 723dce0cf9
commit 723dce0cf9
parent 2d9ce17e3a
5 changed files with 32 additions and 11 deletions
--- a/12825_revision/intermediary_script.py
+++ b/12825_revision/intermediary_script.py
@ -8,10 +8,28 @@ import pandas as pd
 import datetime
 import argparse
 '''
 RUNNING INSTRUCTIONS:
 [1] set up tmux environment
 [2] edit this file where marked "FIX BELOW"
 [3] install pip packages
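 [4] in your tmux environment, set the following three environment variables (written below in Python syntax; in a shell, use the equivalent export NAME=VALUE form)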
    - os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    - os.environ['GIT_ASKPASS'] = 'false'
    - os.environ['GIT_TERMINAL_PROMPT'] = '0'
 [5] in tmux, run the script as follows with your START and STOP values
    - python3 intermediary_script.py --start_index START --stop_index STOP
 [6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs
 THANK YOU VERY MUCH - matt
 '''
 #FIX BELOW: temp_dir is where the repositories will be temporarily cloned; if you are worried about disk space, specify a different location here
 temp_dir = "/data/users/mgaughan/tmp3/"
 cst = datetime.timezone(datetime.timedelta(hours=-6))
 from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
 to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
 #FIX BELOW: this is where the commit data will be stored; the parent directory below must contain the subdirectories contributing_commit_data and readme_commit_data
 COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"
 def temp_clone(vcs_link, temp_location):
@ -126,9 +144,9 @@ def for_all_files(start_index, stop_index):
        lines = [line for line in file]
        for row in tqdm(csv.reader(lines), total=len(lines)):
            index += 1
            #time.sleep(5)
            if index < start_index:
                continue
            time.sleep(4)
            if row[0] == "":
                empty_row += 1
                continue
@ -165,7 +183,7 @@ def for_all_files(start_index, stop_index):
                    print(f'inside cloning error: {e}')
                    raise ValueError(e)
                os.chdir(temp_repo_path)
-                os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`")
+                os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00'`")
                os.chdir(cwd)
                has_readme_bool, has_contributing_bool = False, False
                for filename in os.listdir(temp_repo_path):
@ -222,3 +240,5 @@ if __name__ == "__main__":
    for_all_files(args.start_index, args.stop_index)
    #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)
    #python3 intermediary_script.py --start_index START --stop_index STOP
--- a/R/012825_gam_introduction.png
+++ b/R/012825_gam_introduction.png
--- a/R/contribRDDAnalysis.R
+++ b/R/contribRDDAnalysis.R
@ -51,10 +51,8 @@ quantile(grouped_averages$x)
 quantile(all_actions_data$before_auth_new)
 quantile(all_actions_data$after_auth_new)
-range(all_actions_data$log1p_count) # 0.000000 6.745236
+mean(all_actions_data$count) #  8.440981
-mean(all_actions_data$log1p_count) # 1.200043
+var(all_actions_data$count) # 542.9546
 sd(all_actions_data$log1p_count) 
 median(all_actions_data$log1p_count) # 0.6931472
 # now for merge
 mrg_actions_data$logged_count <- log(mrg_actions_data$count)
 mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
--- a/R/gam_plot_documents.R
+++ b/R/gam_plot_documents.R
@ -68,14 +68,17 @@ doctypeColors <-
            , c("CONTRIBUTING", "README"))
 time_plot <- all_actions_data |>
-  ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) +
+  ggplot(aes(x=week_offset, y=count, color=factor(document_type))) +
-  labs(x="Weekly Offset", y="Commit Count", color="Document Type") +
+  scale_y_continuous(trans = 'log1p', labels = scales::comma) +
  labs(x="Weekly Offset", y="Commit Count", color="Document Type: ") +
  scale_color_manual(values = doctypeColors) + 
  geom_smooth() +
  geom_vline(xintercept = 0)+
  theme_bw() +
  theme(legend.position = "top")
 time_plot
 ggsave(filename = "012825_gam_introduction.png", plot = time_plot, width = 8, height = 6, dpi = 500)
 #code to change the axes
 #scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), 
--- a/R/readmeRDDAnalysis.R
+++ b/R/readmeRDDAnalysis.R
@ -59,8 +59,8 @@ library(lme4)
 library(optimx)
 library(lattice)
 #some more EDA to help choose between Poisson and negative binomial models
-var(all_actions_data$log1p_count) # 1.125429
+var(all_actions_data$count) # 268.4449
-mean (all_actions_data$log1p_count) # 0.6426873
+mean (all_actions_data$count) # 3.757298
 sd(all_actions_data$log1p_count)
 median(all_actions_data$log1p_count) #0
 var(all_actions_data$count) # 268.4449