Merge branch 'master' of https://gitea.communitydata.science/mgaughan/kkex_repo
This commit is contained in:
		
						commit
						ea8677c3f1
					
				@ -43,6 +43,11 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
 | 
				
			|||||||
#logging
 | 
					#logging
 | 
				
			||||||
all_actions_data$logged_count <- log(all_actions_data$count)
 | 
					all_actions_data$logged_count <- log(all_actions_data$count)
 | 
				
			||||||
all_actions_data$log1p_count <- log1p(all_actions_data$count)
 | 
					all_actions_data$log1p_count <- log1p(all_actions_data$count)
 | 
				
			||||||
 | 
					#EDA
 | 
				
			||||||
 | 
					range(all_actions_data$log1p_count) # 0.000000 6.745236
 | 
				
			||||||
 | 
					mean(all_actions_data$log1p_count) # 1.200043
 | 
				
			||||||
 | 
					var(all_actions_data$log1p_count) # 1.753764
 | 
				
			||||||
 | 
					median(all_actions_data$log1p_count) # 0.6931472
 | 
				
			||||||
# now for merge
 | 
					# now for merge
 | 
				
			||||||
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
 | 
					mrg_actions_data$logged_count <- log(mrg_actions_data$count)
 | 
				
			||||||
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
 | 
					mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
 | 
				
			||||||
 | 
				
			|||||||
@ -45,6 +45,7 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
 | 
				
			|||||||
#log the dependent 
 | 
					#log the dependent 
 | 
				
			||||||
all_actions_data$logged_count <- log(all_actions_data$count)
 | 
					all_actions_data$logged_count <- log(all_actions_data$count)
 | 
				
			||||||
all_actions_data$log1p_count <- log1p(all_actions_data$count)
 | 
					all_actions_data$log1p_count <- log1p(all_actions_data$count)
 | 
				
			||||||
 | 
					range(all_actions_data$log1p_count)
 | 
				
			||||||
# 3 rdd in lmer analysis
 | 
					# 3 rdd in lmer analysis
 | 
				
			||||||
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
 | 
					# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
 | 
				
			||||||
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
 | 
					# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
 | 
				
			||||||
@ -55,8 +56,10 @@ library(lattice)
 | 
				
			|||||||
#some more EDA to go between Poisson and neg binomial
 | 
					#some more EDA to go between Poisson and neg binomial
 | 
				
			||||||
var(all_actions_data$log1p_count) # 1.125429
 | 
					var(all_actions_data$log1p_count) # 1.125429
 | 
				
			||||||
mean (all_actions_data$log1p_count) # 0.6426873
 | 
					mean (all_actions_data$log1p_count) # 0.6426873
 | 
				
			||||||
 | 
					median(all_actions_data$log1p_count) #0
 | 
				
			||||||
var(all_actions_data$count) # 268.4449
 | 
					var(all_actions_data$count) # 268.4449
 | 
				
			||||||
mean (all_actions_data$count) # 3.757298
 | 
					mean (all_actions_data$count) # 3.757298
 | 
				
			||||||
 | 
					median(all_actions_data$count) # 0
 | 
				
			||||||
#all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1,  control=glmerControl(optimizer="bobyqa",
 | 
					#all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1,  control=glmerControl(optimizer="bobyqa",
 | 
				
			||||||
#                           optCtrl=list(maxfun=1e5)))
 | 
					#                           optCtrl=list(maxfun=1e5)))
 | 
				
			||||||
all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")
 | 
					all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")
 | 
				
			||||||
 | 
				
			|||||||
@ -2,6 +2,44 @@ import json
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import csv
 | 
					import csv
 | 
				
			||||||
import pandas as pd
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					from git import Repo
 | 
				
			||||||
 | 
					from tqdm import tqdm
 | 
				
			||||||
 | 
					import shutil 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					temp_dir = "/data/users/mgaughan/tmp3/"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _clone_target(vcs_link):
					    """Return ``(clone_url, directory_name)`` for one upstream VCS link.

					    The link is stripped of surrounding whitespace *before* anything is
					    derived from it, so stray spaces/newlines from the CSV never end up in
					    the clone URL or in the temporary directory name.
					    """
					    vcs_link = vcs_link.strip()
					    parts = vcs_link.split("/")
					    if ("github" in vcs_link or "gitlab" in vcs_link) and len(parts) >= 5:
					        # Keep only the first five "/"-separated components and name the
					        # clone directory after the fifth one. The original author's note:
					        # sub-branches are assumed unused (people would fork if needed), so
					        # only the default checkout is looked at.
					        return "/".join(parts[:5]), parts[4] + ".git"
					    # Other hosts (or unusually short links): clone the link as given and
					    # name the directory after its last non-empty path component.
					    return vcs_link, vcs_link.rstrip("/").split("/")[-1] + ".git"


					def how_many_docs(dataset_csv):
					    """Count projects whose checkout contains README / CONTRIBUTING entries.

					    Reads the ``upstream_vcs_link`` column of ``dataset_csv``, clones every
					    listed repository into ``temp_dir``, and inspects the top-level
					    directory listing of the clone. An entry counts when its upper-cased
					    name contains ``README`` or ``CONTRIBUTING`` respectively.

					    Args:
					        dataset_csv: Path to a CSV file with an ``upstream_vcs_link`` column.

					    Returns:
					        A ``(readme_count, contributing_count)`` tuple.

					    Raises:
					        Whatever ``Repo.clone_from`` or ``os.listdir`` raise for a repository;
					        the temporary clone is removed before the exception propagates.
					    """
					    df = pd.read_csv(dataset_csv)
					    project_repos = df['upstream_vcs_link'].to_list()
					    print(len(project_repos))
					    readme_count = 0
					    contributing_count = 0
					    for raw_link in tqdm(project_repos):
					        clone_url, dir_name = _clone_target(raw_link)
					        full_temp_path = os.path.join(temp_dir, dir_name)
					        try:
					            Repo.clone_from(clone_url, full_temp_path)
					            entry_names = [name.upper() for name in os.listdir(full_temp_path)]
					        finally:
					            # Always remove the clone, even when cloning or listing failed,
					            # so a crash does not leave checkouts behind in temp_dir.
					            shutil.rmtree(full_temp_path, ignore_errors=True)
					        if any("README" in name for name in entry_names):
					            readme_count += 1
					        if any("CONTRIBUTING" in name for name in entry_names):
					            contributing_count += 1
					    return readme_count, contributing_count
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def calc_file_denom(project_name):
 | 
					def calc_file_denom(project_name):
 | 
				
			||||||
@ -33,5 +71,7 @@ def for_all_projects():
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    for_all_projects()
 | 
					    #for_all_projects()
 | 
				
			||||||
    #print(calc_file_denom("zzz-to-char"))
 | 
					    #print(calc_file_denom("zzz-to-char"))
 | 
				
			||||||
 | 
					    readmec, contributingc = how_many_docs("final_data/deb_full_data.csv")
 | 
				
			||||||
 | 
					    print("README COUNT: " + str(readmec) + "|| CONTRIBUTING COUNT: " + str(contributingc))
 | 
				
			||||||
							
								
								
									
										26
									
								
								text_analysis/qual_sampling.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								text_analysis/qual_sampling.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,26 @@
 | 
				
			|||||||
 | 
					import csv
 | 
				
			||||||
 | 
					import io 
 | 
				
			||||||
 | 
					import shutil 
 | 
				
			||||||
 | 
					import os 
 | 
				
			||||||
 | 
					from random import sample
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					readme_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme"
 | 
				
			||||||
 | 
					contributing_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_contributing"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def sample_from_doc(sample_k, doc_directory):
					    """Sample long-enough files from every subdirectory of ``doc_directory``.

					    For each subdirectory, files are visited in random order and the first
					    ``sample_k`` files with at least 10 lines are kept. Each file can be
					    chosen at most once, and a subdirectory with fewer than ``sample_k``
					    qualifying files yields a shorter list instead of looping forever.
					    The subdirectory name and its sample are printed, as before.

					    Args:
					        sample_k: Maximum number of files to sample per subdirectory.
					        doc_directory: Directory whose subdirectories hold the documents.

					    Returns:
					        A dict mapping each subdirectory name to a list of
					        ``[file_name, line_count]`` pairs.
					    """
					    min_lines = 10
					    sampled_by_subdir = {}
					    for subdir in os.listdir(doc_directory):
					        subdir_path = os.path.join(doc_directory, subdir)
					        if not os.path.isdir(subdir_path):
					            # Stray files next to the partitions cannot be listed; skip them.
					            continue
					        print(subdir)
					        files = os.listdir(subdir_path)
					        final_sampled = []
					        # sample(files, len(files)) is a random permutation: every file is
					        # tried at most once, so the loop terminates and never picks
					        # duplicates (also safe when the subdirectory is empty).
					        for candidate in sample(files, len(files)):
					            if len(final_sampled) >= sample_k:
					                break
					            candidate_path = os.path.join(subdir_path, candidate)
					            if not os.path.isfile(candidate_path):
					                continue
					            # Only the number of lines matters, so undecodable bytes are
					            # replaced rather than allowed to abort the whole run.
					            with open(candidate_path, "r", errors="replace") as f:
					                file_length = sum(1 for _ in f)
					            if file_length >= min_lines:
					                final_sampled.append([candidate, file_length])
					        print(final_sampled)
					        sampled_by_subdir[subdir] = final_sampled
					    return sampled_by_subdir
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Script entry point: sample three files per README partition.
					    sample_from_doc(sample_k=3, doc_directory=readme_wd)
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user