updating scripts with camera ready information

This commit is contained in:
Matthew Gaughan 2025-01-29 10:31:55 -06:00
parent 2d9ce17e3a
commit 723dce0cf9
5 changed files with 32 additions and 11 deletions

View File

@ -8,10 +8,28 @@ import pandas as pd
import datetime import datetime
import argparse import argparse
'''
RUNNING INSTRUCTIONS:
[1] set up tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install pip packages
[4] in your tmux environment, run the following three commands
- export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no'
- export GIT_ASKPASS='false'
- export GIT_TERMINAL_PROMPT='0'
[5] in tmux, run the script as follows with your START and STOP values
- python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
#FIX BELOW: temp_dir is where the repositories will be temporarily cloned; if you are worried about disk space, point it at a different location here
temp_dir = "/data/users/mgaughan/tmp3/" temp_dir = "/data/users/mgaughan/tmp3/"
cst = datetime.timezone(datetime.timedelta(hours=-6)) cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst) from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst) to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
#FIX BELOW: this is where the commit data will be stored; the parent directory below needs to contain the subdirectories contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/" COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"
def temp_clone(vcs_link, temp_location): def temp_clone(vcs_link, temp_location):
@ -126,9 +144,9 @@ def for_all_files(start_index, stop_index):
lines = [line for line in file] lines = [line for line in file]
for row in tqdm(csv.reader(lines), total=len(lines)): for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1 index += 1
#time.sleep(5)
if index < start_index: if index < start_index:
continue continue
time.sleep(4)
if row[0] == "": if row[0] == "":
empty_row += 1 empty_row += 1
continue continue
@ -165,7 +183,7 @@ def for_all_files(start_index, stop_index):
print(f'inside cloning error: {e}') print(f'inside cloning error: {e}')
raise ValueError(e) raise ValueError(e)
os.chdir(temp_repo_path) os.chdir(temp_repo_path)
os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00' master`") os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00'`")
os.chdir(cwd) os.chdir(cwd)
has_readme_bool, has_contributing_bool = False, False has_readme_bool, has_contributing_bool = False, False
for filename in os.listdir(temp_repo_path): for filename in os.listdir(temp_repo_path):
@ -221,4 +239,6 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
for_all_files(args.start_index, args.stop_index) for_all_files(args.start_index, args.stop_index)
#temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir) #temp_repo, temp_repo_path = temp_clone(" https://gitlab.gnome.org/GNOME/almanah", temp_dir)
#delete_clone(temp_dir) #delete_clone(temp_dir)
#python3 intermediary_script.py --start_index START --stop_index STOP

Binary file not shown.

After

Width:  |  Height:  |  Size: 316 KiB

View File

@ -51,10 +51,8 @@ quantile(grouped_averages$x)
quantile(all_actions_data$before_auth_new) quantile(all_actions_data$before_auth_new)
quantile(all_actions_data$after_auth_new) quantile(all_actions_data$after_auth_new)
range(all_actions_data$log1p_count) # 0.000000 6.745236 mean(all_actions_data$count) # 8.440981
mean(all_actions_data$log1p_count) # 1.200043 var(all_actions_data$count) # 542.9546
sd(all_actions_data$log1p_count)
median(all_actions_data$log1p_count) # 0.6931472
# now for merge # now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count) mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)

View File

@ -68,14 +68,17 @@ doctypeColors <-
, c("CONTRIBUTING", "README")) , c("CONTRIBUTING", "README"))
time_plot <- all_actions_data |> time_plot <- all_actions_data |>
ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + ggplot(aes(x=week_offset, y=count, color=factor(document_type))) +
labs(x="Weekly Offset", y="Commit Count", color="Document Type") + scale_y_continuous(trans = 'log1p', labels = scales::comma) +
labs(x="Weekly Offset", y="Commit Count", color="Document Type: ") +
scale_color_manual(values = doctypeColors) + scale_color_manual(values = doctypeColors) +
geom_smooth() + geom_smooth() +
geom_vline(xintercept = 0)+ geom_vline(xintercept = 0)+
theme_bw() + theme_bw() +
theme(legend.position = "top") theme(legend.position = "top")
time_plot time_plot
ggsave(filename = "012825_gam_introduction.png", plot = time_plot, width = 8, height = 6, dpi = 500)
#code to change the axes #code to change the axes
#scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), #scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5),

View File

@ -59,8 +59,8 @@ library(lme4)
library(optimx) library(optimx)
library(lattice) library(lattice)
#some more EDA to go between Poisson and neg binomial #some more EDA to go between Poisson and neg binomial
var(all_actions_data$log1p_count) # 1.125429 var(all_actions_data$count) # 268.4449
mean (all_actions_data$log1p_count) # 0.6426873 mean (all_actions_data$count) # 3.757298
sd(all_actions_data$log1p_count) sd(all_actions_data$log1p_count)
median(all_actions_data$log1p_count) #0 median(all_actions_data$log1p_count) #0
var(all_actions_data$count) # 268.4449 var(all_actions_data$count) # 268.4449