initial analysis

parent 207cf61e88
commit 3c4ea14c81
@@ -5,12 +5,12 @@ library(dplyr)
 library(lubridate)
 
 #for a given file we want to get the count data and produce a csv
-readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_README_publication_commits.csv"
-contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_CONTRIBUTING_publication_commits.csv"
-readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/readme/"
-contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/"
+readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv"
+contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv"
+readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/"
+contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/"
 
-test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
+#test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
 
 transform_commit_data <- function(filepath, ref_df){
   #basic, loading in the file 

@@ -31,7 +31,7 @@ transform_commit_data <- function(filepath, ref_df){
   #find the publication entry, in the specified df
   matched_entry <- ref_df |>
     filter(repo_id == project_id)
-  commit_date <- as.Date(matched_entry$commit_date)
+  commit_date <- min(as.Date(matched_entry$commit_date))
 
   #get information about project age either in the "present" 
   #or at the time of first commit

@@ -134,16 +134,15 @@ transform_directory_of_commit_data <- function(is_readme) {
 }
 
 #below is for contributing file 
-test_big_df <- transform_directory_of_commit_data(is_readme=FALSE)
-output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_CONTRIBUTING.csv"
+#test_big_df <- transform_directory_of_commit_data(is_readme=FALSE)
+#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
 #below is for readme
-#test_big_df <- transform_directory_of_commit_data(is_readme=TRUE)
-#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_README.csv"
+big_df <- transform_directory_of_commit_data(is_readme=TRUE)
+output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
-
 #validation testing 
-#length(unique(test_big_df$project_id))
+length(unique(big_df$project_id))
 #filtered_df <- test_big_df %>%
 #  filter(commit_count != 0, new_author_emails == 0, new_committer_emails == 0)
 
 #another graceful exit
-#test_big_df.to_csv(output_filepath, index=False)
+write.csv(big_df, output_filepath, row.names = FALSE)
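Worth flagging in the min() change above: filter(repo_id == project_id) can match more than one publication row, in which case as.Date(matched_entry$commit_date) is a vector longer than one and downstream date arithmetic recycles silently; min() collapses the match to the earliest publication date. A minimal sketch of the difference, with hypothetical data not from the repository:

matched_entry <- data.frame(commit_date = c("2020-05-01", "2019-03-10"))
as.Date(matched_entry$commit_date)       # length-2 Date vector
min(as.Date(matched_entry$commit_date))  # single Date: "2019-03-10"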
mlm/contributing_did_model_fit.R (new file, +24)
@@ -0,0 +1,24 @@
+library(dplyr)
+library(lubridate)
+library(rdd)
+
+contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_df = read.csv(contributing_df_filepath, header = TRUE)
+
+window_num <- 5
+contributing_df <- contributing_df |>
+  filter(week_index >= (- window_num) & week_index <= (window_num)) |>
+  mutate(scaled_age = scale(age)) |>
+  mutate(scaled_age_at_commit = scale(age_at_commit)) |>
+  mutate(log1p_count = log1p(commit_count))
+
+library(lme4)
+library(optimx)
+library(lattice)
+
+all_gmodel <- glmer.nb(log1p_count ~ before_after * week_index + scaled_age + (before_after * week_index | project_id),
+                       control=glmerControl(optimizer="bobyqa",
+                                            optCtrl=list(maxfun=2e5)), nAGQ=0,
+                       data=contributing_df)
+
+summary(all_gmodel)
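Two hedged notes on this fit, neither part of the commit: the response handed to glmer.nb is the pre-transformed log1p_count rather than the raw commit_count a negative binomial family would normally see, and coefficients come back on the log link scale. Assuming all_gmodel converges, the standard lme4 accessors would summarize it:

fixef(all_gmodel)             # fixed effects: before_after, week_index, their interaction, scaled_age
exp(fixef(all_gmodel))        # rate-ratio scale, since the NB family uses a log link
ranef(all_gmodel)$project_id  # per-project intercept/slope deviations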
mlm/contributing_did_prep.R (new file, +30)
@@ -0,0 +1,30 @@
+library(dplyr)
+library(lubridate)
+library(rdd)
+
+contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_df = read.csv(contributing_df_filepath, header = TRUE)
+
+#EDA
+var(df$commit_count) # 325.5261
+mean(df$commit_count) # 7.743385
+median(df$commit_count) # 1
+mean(df$age) # 4838.649 days
+mean(df$age_at_commit) # 2141.996 days
+median(df$age) # 4597 days
+median(df$age_at_commit) # 1603 days
+
+# scale and log-transform
+df$scaled_age <- scale(df$age)
+df$scaled_age_at_commit <- scale(df$age_at_commit)
+df$log1p_count <- log1p(df$commit_count)
+
+#getting IK Bandwidth
+get_optimal_bandwidth <- function(df){
+  IKbandwidth(df$week_index, df$log1p_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+}
+
+mean_optimal_bandwidth <- df %>%
+  group_by(project_id) %>%
+  summarise(optimal_bandwidth = get_optimal_bandwidth(cur_data())) %>%
+  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))
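The per-project Imbens-Kalyanaraman bandwidths are averaged here, apparently to pick the event window around the cutpoint. A hedged sketch of how that value would be applied downstream (hypothetical names; the rounding choice is an assumption):

bw <- round(mean_optimal_bandwidth$mean_optimal_bandwidth)
analysis_df <- df %>%
  filter(week_index >= -bw, week_index <= bw)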
mlm/gam_plot.R (new file, +44)
@@ -0,0 +1,44 @@
+
+contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_df = read.csv(contributing_df_filepath, header = TRUE)
+
+readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
+readme_df = read.csv(readme_df_filepath, header = TRUE)
+
+window_num <- 5
+contributing_df <- contributing_df |>
+  filter(week_index >= (- window_num) & week_index <= (window_num)) |>
+  mutate(doc_type = "CONTRIBUTING")
+
+readme_df <- readme_df |>
+  filter(week_index >= (- window_num) & week_index <= (window_num)) |>
+  mutate(doc_type = "README")
+
+main_df <- rbind(contributing_df, readme_df)
+main_df$log1p_count <- log1p(main_df$commit_count)
+
+library(scales)
+library(ggplot2)
+
+expm1_trans <- trans_new(
+  name = 'expm1',
+  transform = function(x) expm1(x),
+  inverse = function(x) log1p(x)
+)
+
+doctypeColors <-
+  setNames( c('#5da2d8', '#c7756a')
+            , c("CONTRIBUTING", "README"))
+
+time_plot <- main_df |>
+  ggplot(aes(x=week_index, y=commit_count, color=factor(doc_type))) +
+  scale_y_continuous(trans = 'log1p', labels = scales::comma) +
+  labs(x="Weekly Offset", y="Commit Count", color="Document Type: ") +
+  scale_color_manual(values = doctypeColors) +
+  geom_smooth() +
+  geom_vline(xintercept = 0)+
+  theme_bw() +
+  theme(legend.position = "top")
+time_plot
+
+#ggsave(filename = "plots/cr-020225-gam-introduction.png", plot = time_plot, width = 8, height = 6, dpi = 700)
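Note that expm1_trans is defined but never used; the y axis passes the built-in 'log1p' transform by name instead. Also, geom_smooth() with no method argument picks its default smoother, which for 1000 or more observations is a GAM. A hedged one-liner making that default explicit (same smoother ggplot2 would choose on large data):

geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))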
mlm/plots/cr-020225-gam-introduction.png (new binary file, 493 KiB)
Binary file not shown.
mlm/readme_did_model_fit.R (new file, +0)
Empty placeholder file.
mlm/readme_did_prep.R (new file, +34)
@@ -0,0 +1,34 @@
+library(tidyverse)
+library(dplyr)
+library(lubridate)
+library(rdd)
+
+readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
+df = read.csv(readme_df_filepath, header = TRUE)
+
+#EDA
+var(df$commit_count) # 112.4945
+mean(df$commit_count) # 2.431342
+median(df$commit_count) # 0
+mean(df$age) # 4911.734 days
+mean(df$age_at_commit) # 197.296 days
+median(df$age) # 4689 days
+median(df$age_at_commit) # 0 days
+
+# scale and log-transform
+df$scaled_age <- scale(df$age)
+df$scaled_age_at_commit <- scale(df$age_at_commit)
+df$log1p_count <- log1p(df$commit_count)
+
+#getting IK Bandwidth
+get_optimal_bandwidth <- function(df){
+  IKbandwidth(df$week_index, df$log1p_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+}
+
+mean_optimal_bandwidth <- df %>%
+  group_by(project_id) %>%
+  summarise(optimal_bandwidth = get_optimal_bandwidth(cur_data())) %>%
+  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))
+
+#Mean Optimal Bandwidth: 5.44841
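The closing comment reports a mean IK bandwidth of about 5.45 weeks, which is presumably where the five-week window in the model-fit and plotting scripts comes from (an inference, not stated in the commit):

window_num <- 5  # roughly the floor of the 5.44841 mean optimal bandwidth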
@@ -3,7 +3,7 @@
 #SBATCH --job-name=mg-govdoc-cr
 #SBATCH --partition=cpu-g2-mem2x  #update this line - use hyakalloc to find partitions you can use
 
-#SBATCH --time=04:00:00
+#SBATCH --time=05:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --mem=64G
(unnamed new notebook, +112; contents identical to text_analysis/partitioned_readability.ipynb below)
(unnamed new notebook, +33; a single empty code cell, identical to text_analysis/partitioning_dirs.ipynb below)
text_analysis/.ipynb_checkpoints/topicModel-checkpoint.ipynb (new file, +339)
Jupyter checkpoint copy; contents identical to text_analysis/topicModel.ipynb below.
text_analysis/partitioned_readability.ipynb (new file, +112)
New notebook; code cells shown in percent format, with the notebook JSON scaffolding (cell ids, empty outputs, Python 3.13.1 kernel metadata) omitted:

# %%
import os
import textstat
import csv

# %%
readme_wd = ""
contributing_wd = ""

csv_fieldnames = ['subdir', 'filename', 'flesch_reading_ease', 'flesch_kincaid_grade', 'linsear_write_formula', 'dale_chall_readability_score', 'mcalpine_eflaw', 'reading_time', 'char_count', 'word_count']

# %%
'''
gets the 3 readability scores for each individual textfile
'''
def get_readibility(file_address, file_dict):
    file = open(file_address, "r")
    document = file.read()
    file_dict['flesch_reading_ease'] = textstat.flesch_reading_ease(document)
    file_dict['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(document)
    file_dict['linsear_write_formula'] = textstat.linsear_write_formula(document)
    file_dict['dale_chall_readability_score'] = textstat.dale_chall_readability_score(document)
    file_dict['mcalpine_eflaw'] = textstat.mcalpine_eflaw(document)
    file_dict['reading_time'] = textstat.reading_time(document, ms_per_char=14.69)
    file_dict['char_count'] = textstat.char_count(document, ignore_spaces=True)
    file_dict['word_count'] = textstat.lexicon_count(document, removepunct=True)
    return file_dict

# %%
'''
getting readability scoring for each type of document
'''
def generate_file(output_csv, wdirectory, document_type):
    with open(output_csv, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = csv_fieldnames)
        writer.writeheader()
        subdirs = os.listdir(wdirectory)
        print(document_type)
        for dir in subdirs:
            print(dir)
            files = os.listdir(wdirectory + "/" + dir)
            count = 0
            for file in files:
                file_dict = {"subdir": dir, "filename": file}
                print(file)
                full_address = wdirectory + "/" + dir + "/" + file
                file_dict = get_readibility(full_address, file_dict)
                writer.writerow(file_dict)

# %%
generate_file('dwo_readability_contributing.csv', contributing_wd, "contributing")
text_analysis/partitioning_dirs.ipynb (new file, +33)
New notebook containing a single empty code cell and standard Python 3.13.1 kernel metadata; no code yet.
							
								
								
									
										339
									
								
								text_analysis/topicModel.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										339
									
								
								text_analysis/topicModel.ipynb
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,339 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import glob\n",
    "import copy\n",
    "import csv\n",
    "from statistics import mean, median\n",
    "from strip_markdown import strip_markdown\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9483091c-ac72-415c-932d-ac7cf7970789",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "import gensim.corpora as corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models import CoherenceModel\n",
    "from gensim.models.phrases import Phrases\n",
    "\n",
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "\n",
    "from statistics import mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3da6b590-875d-478d-aaaa-de020039c519",
   "metadata": {},
   "outputs": [],
   "source": [
    "# spacy and nltk for lemmatization\n",
    "import nltk\n",
    "#nltk.download('stopwords'); nltk.download('wordnet') #wordnet is needed by WordNetLemmatizer\n",
    "import spacy\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "stopwords = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def metadata_for_file(file):\n",
    "    word_list = file.split()\n",
    "    word_count = len(word_list)\n",
    "    #print(word_list)\n",
    "    if word_count == 0:\n",
    "        avg_word_length = 0\n",
    "    else:\n",
    "        avg_word_length = sum(map(len, word_list)) / len(word_list)\n",
    "    #return word count and average word length\n",
    "    return word_count, avg_word_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_from_dir(directory):\n",
    "    files = glob.glob(f\"{directory}/*\")\n",
    "    data_list = []\n",
    "    word_counts = []\n",
    "    avg_word_lengths = []\n",
    "    file_list = []\n",
    "    for file in files:\n",
    "        text = open(file, encoding='utf-8').read()\n",
    "        #here's some of the descriptive text analysis\n",
    "        word_count, avg_word_length = metadata_for_file(text)\n",
    "        word_counts.append(word_count)\n",
    "        avg_word_lengths.append(avg_word_length)\n",
    "        #adding the data to the list of text\n",
    "        data_list.append(text)\n",
    "        #adding filename\n",
    "        file_list.append(file)\n",
    "    return data_list, word_counts, avg_word_lengths, file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
   "metadata": {},
   "outputs": [],
   "source": [
    "#preprocessing text data\n",
    "def preprocess(corpus_list):\n",
    "    #extending stopwords\n",
    "    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
    "    \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
    "    stopwords.extend(specific_stopwords)\n",
    "    D = copy.copy(corpus_list)\n",
    "    #stripping markdown from documents\n",
    "    D = [strip_markdown(doc) for doc in D]\n",
    "    #strip html comments\n",
    "    D = [re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL) for doc in D]\n",
    "    #mvp right now, can certainly be expanded as iterations of text analysis are done\n",
    "    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2] for doc in D]\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n",
    "    return D_lemma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#preparing processed data for model usage\n",
    "def text_preparation(lemmatized_text):\n",
    "    #bigrams (preprocess() returns joined strings, so re-tokenize for gensim)\n",
    "    D_bigrams = [doc.split() for doc in lemmatized_text]\n",
    "    bigram = Phrases(D_bigrams, min_count=2)\n",
    "    for i in range(len(lemmatized_text)):\n",
    "        for token in bigram[D_bigrams[i]]:\n",
    "            if '_' in token:\n",
    "                D_bigrams[i].append(token)\n",
    "    #id2word\n",
    "    id2word = corpora.Dictionary(D_bigrams)\n",
    "    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
    "    #bow representation\n",
    "    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n",
    "    return bag_of_words, id2word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24799e25-2c0c-4e16-b503-68296f604f52",
   "metadata": {},
   "outputs": [],
   "source": [
    "def lda_model_identification(data_vectorized):\n",
    "    lda = LatentDirichletAllocation()\n",
    "    search_params = {'n_components': [TKTK], 'learning_decay': [.5, .7, .9], 'batch_size': [128, 256]}\n",
    "    model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n",
    "    model.fit(data_vectorized)\n",
    "    best_lda_model = model.best_estimator_\n",
    "    print(\"Best Model's Params: \", model.best_params_)\n",
    "    print(\"Best Log Likelihood Score: \", model.best_score_)\n",
    "    print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def best_lda_model(data_vectorized, vocab):\n",
    "    lda = LatentDirichletAllocation(n_components=TKTK, learning_decay=TKTK, batch_size=TKTK, max_iter=TKTK)\n",
    "    id_topic = lda.fit_transform(data_vectorized)\n",
    "    topic_words = {}\n",
    "    for topic, comp in enumerate(lda.components_):\n",
    "        word_idx = np.argsort(comp)[::-1][:10]\n",
    "        topic_words[topic] = [vocab[i] for i in word_idx]\n",
    "    for topic, words in topic_words.items():\n",
    "        print('Topic: %d' % topic)\n",
    "        print('  %s' % ', '.join(words))\n",
    "    #lda.print_topics(num_words=10)\n",
    "    joblib.dump(lda, '020125_DOCTYPE_lda.jl')\n",
    "    #lda = joblib.load('0509_lda.jl')\n",
    "    return id_topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_most_prevalent(vect_documents, documents):\n",
    "    lda = joblib.load('TKTK_lda.jl')\n",
    "    distributions = lda.transform(vect_documents)\n",
    "    most_prevalent = {0: [0, \"\"], 1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"]}\n",
    "    for i, topic_distribution in enumerate(distributions):\n",
    "        for j in range(8):\n",
    "            if topic_distribution[j] > most_prevalent[j][0]:\n",
    "                most_prevalent[j] = [topic_distribution[j], documents[i]]\n",
    "    print(most_prevalent)\n",
    "    return most_prevalent\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prevalent_topics(vect_documents, file_list):\n",
    "    lda = joblib.load('TKTKTKTK_lda.jl')\n",
    "    #lda = joblib.load('0514_contrib_lda.jl')\n",
    "    distributions = lda.transform(vect_documents)\n",
    "    #figuring out what the max distribution is and then figuring out the mode\n",
    "    top_topic = []\n",
    "    count_of_multiple = 0\n",
    "    topic_arrays = []\n",
    "    for i, topic_distribution in enumerate(distributions):\n",
    "        max_dist = max(topic_distribution)\n",
    "        indexes = np.where(topic_distribution == max_dist)[0]\n",
    "        if len(indexes) == 1:\n",
    "            top_topic.append(indexes[0])\n",
    "        else:\n",
    "            count_of_multiple += 1\n",
    "        topic_arrays.append(topic_distribution)\n",
    "    most_frequent(top_topic)\n",
    "    print(count_of_multiple)\n",
    "    df = pd.DataFrame(topic_arrays)\n",
    "    #finding the distribution values for all documents\n",
    "    with open('readme_file_topic_distributions.csv', 'w', newline='') as csvfile:\n",
    "        fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']\n",
    "        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "        writer.writeheader()\n",
    "        for i, row in df.iterrows():\n",
    "            project_dir = {}\n",
    "            project_dir['filename'] = file_list[i].split(\"/\")[-1]\n",
    "            array_row = df.iloc[i].to_numpy()\n",
    "            for j in range(8):\n",
    "                project_dir[\"t\" + str(j)] = array_row[j]\n",
    "            writer.writerow(project_dir)\n",
    "    #print(df.sort_values(by=['0']).head(5))\n",
    "    '''\n",
    "    for i in range(8):\n",
    "        print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n",
    "        top5 = df.nlargest(10, i)\n",
    "        top_indices = top5.index.to_list()\n",
    "        print(top5)\n",
    "        for index in top_indices:\n",
    "            print(file_list[index])\n",
    "        bottom5 = df.nsmallest(10, i)\n",
    "        bottom_indices = bottom5.index.to_list()\n",
    "        print(bottom5)\n",
    "        for index in bottom_indices:\n",
    "            print(file_list[index])\n",
    "    '''\n",
    "    averages = df.mean()\n",
    "    print(averages)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(topic_prevalence):\n",
    "    most_frequent_array = []\n",
    "    for j in range(4):\n",
    "        topic = mode(topic_prevalence)\n",
    "        most_frequent_array.append(topic)\n",
    "        topic_prevalence = [i for i in topic_prevalence if i != topic]\n",
    "    print(most_frequent_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f937c2e-2714-475d-b670-602164c46642",
   "metadata": {},
   "outputs": [],
   "source": [
    "listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n",
    "print(\"Mean wordcount: \", mean(wordcounts))\n",
    "print(\"Median wordcount: \", median(wordcounts))\n",
    "print(\"Mean wordlength: \", mean(wordlengths))\n",
    "print(\"Median wordlength: \", median(wordlengths))\n",
    "lemmatized_corpus = preprocess(listed_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(analyzer='word',\n",
    "                         min_df=2,\n",
    "                         stop_words='english',\n",
    "                         lowercase=True,\n",
    "                         token_pattern='[a-zA-Z0-9]{2,}',\n",
    "                        )\n",
    "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
    "joblib.dump(vectorizer, '020125_DOCTYPE_vectorizer.joblib')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
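
Note: below is a minimal sketch of how the notebook's sklearn path chains together once the TKTK placeholders are filled in. It assumes the function cells above have been run; the directory path and the hyperparameter choices (8 topics for the n_components search, concrete values for the TKTK slots) are illustrative assumptions, not values fixed by this commit.

# hypothetical end-to-end run of the sklearn path; all concrete values are assumptions
readme_directory = "/path/to/readme_corpus"  # placeholder, not a real path

listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)
lemmatized_corpus = preprocess(listed_corpus)

vectorizer = CountVectorizer(analyzer='word', min_df=2, stop_words='english',
                             lowercase=True, token_pattern='[a-zA-Z0-9]{2,}')
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)

# grid search over candidate topic counts, e.g. 'n_components': [5, 8, 10]
lda_model_identification(data_vectorized)

# refit the winning configuration, print the top words per topic, and tag each document
vocab = vectorizer.get_feature_names_out()
id_topic = best_lda_model(data_vectorized, vocab)
prevalent_topics(data_vectorized, file_list)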
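
The gensim imports (corpora, Phrases, CoherenceModel) and text_preparation() imply a parallel gensim workflow that this commit never wires up. A speculative sketch of that path, assuming eight topics and the c_v coherence measure:

# assumed wiring for the gensim helpers; not part of the committed notebook
from gensim.models import LdaModel

bag_of_words, id2word = text_preparation(lemmatized_corpus)
gensim_lda = LdaModel(corpus=bag_of_words, id2word=id2word,
                      num_topics=8, random_state=42)

# the otherwise-unused CoherenceModel import suggests model quality was to be
# scored against the tokenized documents
tokenized = [doc.split() for doc in lemmatized_corpus]
coherence = CoherenceModel(model=gensim_lda, texts=tokenized,
                           dictionary=id2word, coherence='c_v')
print("Coherence (c_v):", coherence.get_coherence())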