diff --git a/cleaning_scripts/get_weekly_commit_counts.R b/cleaning_scripts/get_weekly_commit_counts.R index dae4052..0a631d1 100644 --- a/cleaning_scripts/get_weekly_commit_counts.R +++ b/cleaning_scripts/get_weekly_commit_counts.R @@ -5,12 +5,12 @@ library(dplyr) library(lubridate) #for a given file we want to get the count data and produce a csv -readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_README_publication_commits.csv" -contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_CONTRIBUTING_publication_commits.csv" -readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/readme/" -contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/" +readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv" +contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv" +readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/" +contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/" -test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv" +#test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv" transform_commit_data <- function(filepath, ref_df){ #basic, loading in the file @@ -31,7 +31,7 @@ transform_commit_data <- function(filepath, ref_df){ #find the publication entry, in the specified df matched_entry <- ref_df |> filter(repo_id == project_id) - commit_date <- as.Date(matched_entry$commit_date) + commit_date <- min(as.Date(matched_entry$commit_date)) #get 
information about project age either in the "present" #or at the time of first commit @@ -134,16 +134,15 @@ transform_directory_of_commit_data <- function(is_readme) { } #below is for contributing file -test_big_df <- transform_directory_of_commit_data(is_readme=FALSE) -output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_CONTRIBUTING.csv" +#test_big_df <- transform_directory_of_commit_data(is_readme=FALSE) +#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" #below is for readme -#test_big_df <- transform_directory_of_commit_data(is_readme=TRUE) -#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_README.csv" - +big_df <- transform_directory_of_commit_data(is_readme=TRUE) +output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv" #validation testing -#length(unique(test_big_df$project_id)) +length(unique(big_df$project_id)) #filtered_df <- test_big_df %>% # filter(commit_count != 0, new_author_emails == 0, new_committer_emails == 0) #another graceful exit -#test_big_df.to_csv(output_filepath, index=False) \ No newline at end of file +write.csv(big_df, output_filepath, row.names = FALSE) diff --git a/mlm/contributing_did_model_fit.R b/mlm/contributing_did_model_fit.R new file mode 100644 index 0000000..1952b16 --- /dev/null +++ b/mlm/contributing_did_model_fit.R @@ -0,0 +1,24 @@ +library(dplyr) +library(lubridate) +library(rdd) + +contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +contributing_df = read.csv(contributing_df_filepath, header = TRUE) + +window_num <- 5 +contributing_df <- contributing_df |> + filter(week_index >= (- window_num) & week_index <= (window_num)) |> + mutate(scaled_age = scale(age)) |> + mutate(scaled_age_at_commit = scale(age_at_commit))|> + 
mutate(log1p_count = log1p(commit_count)) + +library(lme4) +library(optimx) +library(lattice) + +all_gmodel <- glmer.nb(log1p_count ~ before_after * week_index + scaled_age + (before_after * week_index | project_id), + control=glmerControl(optimizer="bobyqa", + optCtrl=list(maxfun=2e5)), nAGQ=0, + data=contributing_df) + +summary(all_gmodel) diff --git a/mlm/contributing_did_prep.R b/mlm/contributing_did_prep.R new file mode 100644 index 0000000..7ffc6e4 --- /dev/null +++ b/mlm/contributing_did_prep.R @@ -0,0 +1,30 @@ +library(dplyr) +library(lubridate) +library(rdd) + +contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +df = read.csv(contributing_df_filepath, header = TRUE) + +#EDA +var(df$commit_count) # 325.5261 +mean(df$commit_count) # 7.743385 +median(df$commit_count) # 1 +mean(df$age) # 4838.649 days +mean(df$age_at_commit) # 2141.996 days +median(df$age) # 4597 days +median(df$age_at_commit) # 1603 days + +# scale and log-transform +df$scaled_age <- scale(df$age) +df$scaled_age_at_commit <- scale(df$age_at_commit) +df$log1p_count <- log1p(df$commit_count) + +#getting IK Bandwidth +get_optimal_bandwidth <- function(df){ + IKbandwidth(df$week_index, df$log1p_count, cutpoint = 0, verbose = FALSE, kernel = "triangular") +} + +mean_optimal_bandwidth <- df %>% + group_by(project_id) %>% + summarise(optimal_bandwidth = get_optimal_bandwidth(cur_data())) %>% + summarise(mean_optimal_bandwidth = mean(optimal_bandwidth)) \ No newline at end of file diff --git a/mlm/gam_plot.R b/mlm/gam_plot.R new file mode 100644 index 0000000..bdc4a88 --- /dev/null +++ b/mlm/gam_plot.R @@ -0,0 +1,44 @@ + +contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +contributing_df = read.csv(contributing_df_filepath, header = TRUE) + +readme_df_filepath <- 
"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv" +readme_df = read.csv(readme_df_filepath, header = TRUE) + +window_num <- 5 +contributing_df <- contributing_df |> + filter(week_index >= (- window_num) & week_index <= (window_num)) |> + mutate(doc_type = "CONTRIBUTING") + +readme_df <- readme_df |> + filter(week_index >= (- window_num) & week_index <= (window_num)) |> + mutate(doc_type = "README") + +main_df <- rbind(contributing_df, readme_df) +main_df$log1p_count <- log1p(main_df$commit_count) + +library(scales) +library(ggplot2) + +expm1_trans <- trans_new( + name = 'expm1', + transform = function(x) expm1(x), + inverse = function(x) log1p(x) +) + +doctypeColors <- + setNames( c('#5da2d8', '#c7756a') + , c("CONTRIBUTING", "README")) + +time_plot <- main_df |> + ggplot(aes(x=week_index, y=commit_count, color=factor(doc_type))) + + scale_y_continuous(trans = 'log1p', labels = scales::comma) + + labs(x="Weekly Offset", y="Commit Count", color="Document Type: ") + + scale_color_manual(values = doctypeColors) + + geom_smooth() + + geom_vline(xintercept = 0)+ + theme_bw() + + theme(legend.position = "top") +time_plot + +#ggsave(filename = "plots/cr-020225-gam-introduction.png", plot = time_plot, width = 8, height = 6, dpi = 700) diff --git a/mlm/plots/cr-020225-gam-introduction.png b/mlm/plots/cr-020225-gam-introduction.png new file mode 100644 index 0000000..7b5226a Binary files /dev/null and b/mlm/plots/cr-020225-gam-introduction.png differ diff --git a/mlm/readme_did_model_fit.R b/mlm/readme_did_model_fit.R new file mode 100644 index 0000000..e69de29 diff --git a/mlm/readme_did_prep.R b/mlm/readme_did_prep.R new file mode 100644 index 0000000..461c9cb --- /dev/null +++ b/mlm/readme_did_prep.R @@ -0,0 +1,34 @@ +library(tidyverse) +library(dplyr) +library(lubridate) +library(rdd) + +readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv" +df = 
read.csv(readme_df_filepath, header = TRUE) + +#EDA +var(df$commit_count) # 112.4945 +mean(df$commit_count) # 2.431342 +median(df$commit_count) # 0 +mean(df$age) # 4911.734 days +mean(df$age_at_commit) # 197.296 days +median(df$age) # 4689 days +median(df$age_at_commit) # 0 days + +# scale and log-transform +df$scaled_age <- scale(df$age) +df$scaled_age_at_commit <- scale(df$age_at_commit) +df$log1p_count <- log1p(df$commit_count) + + +#getting IK Bandwidth +get_optimal_bandwidth <- function(df){ + IKbandwidth(df$week_index, df$log1p_count, cutpoint = 0, verbose = FALSE, kernel = "triangular") +} + +mean_optimal_bandwidth <- df %>% + group_by(project_id) %>% + summarise(optimal_bandwidth = get_optimal_bandwidth(cur_data())) %>% + summarise(mean_optimal_bandwidth = mean(optimal_bandwidth)) + +#Mean Optimal Bandwidth: 5.44841 diff --git a/rstudio-server.job b/rstudio-server.job index 3a9c9ac..2feccea 100644 --- a/rstudio-server.job +++ b/rstudio-server.job @@ -3,7 +3,7 @@ #SBATCH --job-name=mg-govdoc-cr #SBATCH --partition=cpu-g2-mem2x #update this line - use hyakalloc to find partitions you can use -#SBATCH --time=04:00:00 +#SBATCH --time=05:00:00 #SBATCH --nodes=1 #SBATCH --ntasks=4 #SBATCH --mem=64G diff --git a/text_analysis/.ipynb_checkpoints/partitioned_readability-checkpoint.ipynb b/text_analysis/.ipynb_checkpoints/partitioned_readability-checkpoint.ipynb new file mode 100644 index 0000000..0bd6566 --- /dev/null +++ b/text_analysis/.ipynb_checkpoints/partitioned_readability-checkpoint.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f4c4796f-d109-472d-8f9c-95c6ec85f757", + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "import textstat\n", + "import csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f1f2fce-2335-4ee3-81f2-55822e2f63f9", + "metadata": {}, + "outputs": [], + "source": [ + "readme_wd = \"\"\n", + "contributing_wd = \"\"\n", + "\n", + 
"csv_fieldnames = ['subdir', 'filename', 'flesch_reading_ease', 'flesch_kincaid_grade', 'linsear_write_formula', 'dale_chall_readability_score', 'mcalpine_eflaw', 'reading_time', 'char_count', 'word_count']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0d3b5b1-ae97-4a46-95e0-92232c46c2fa", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "gets the 3 readability scores for each individual textfile\n", + "'''\n", + "def get_readibility(file_address, file_dict):\n", + " file = open(file_address, \"r\")\n", + " document = file.read()\n", + " file_dict['flesch_reading_ease'] = textstat.flesch_reading_ease(document)\n", + " file_dict['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(document)\n", + " file_dict['linsear_write_formula'] = textstat.linsear_write_formula(document)\n", + " file_dict['dale_chall_readability_score'] = textstat.dale_chall_readability_score(document)\n", + " file_dict['mcalpine_eflaw'] = textstat.mcalpine_eflaw(document)\n", + " file_dict['reading_time'] = textstat.reading_time(document, ms_per_char=14.69)\n", + " file_dict['char_count'] = textstat.char_count(document, ignore_spaces=True)\n", + " file_dict['word_count'] = textstat.lexicon_count(document, removepunct=True)\n", + " return file_dict\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3c481e-c521-4e1d-926e-88f4b75ae7de", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "getting readability scoring for each type of document\n", + "'''\n", + "def generate_file(output_csv, wdirectory, document_type):\n", + " with open(output_csv, 'w') as csvfile: \n", + " writer = csv.DictWriter(csvfile, fieldnames = csv_fieldnames) \n", + " writer.writeheader() \n", + " subdirs = os.listdir(wdirectory)\n", + " print(document_type)\n", + " for dir in subdirs: \n", + " print(dir)\n", + " files = os.listdir(wdirectory + \"/\" + dir)\n", + " count = 0\n", + " for file in files:\n", + " file_dict = {\"subdir\": dir, 
\"filename\": file}\n", + " print(file)\n", + " full_address = wdirectory + \"/\" + dir + \"/\" + file\n", + " file_dict = get_readibility(full_address, file_dict)\n", + " writer.writerow(file_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e0a7b88-49b6-4053-84b8-f54f1c6536c0", + "metadata": {}, + "outputs": [], + "source": [ + "generate_file('dwo_readability_contributing.csv', contributing_wd, \"contributing\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/.ipynb_checkpoints/partitioning_dirs-checkpoint.ipynb b/text_analysis/.ipynb_checkpoints/partitioning_dirs-checkpoint.ipynb new file mode 100644 index 0000000..a5cdac3 --- /dev/null +++ b/text_analysis/.ipynb_checkpoints/partitioning_dirs-checkpoint.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "76168c17-548e-4bf2-a1fa-6c0b6372262a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/.ipynb_checkpoints/topicModel-checkpoint.ipynb b/text_analysis/.ipynb_checkpoints/topicModel-checkpoint.ipynb new file mode 100644 index 0000000..0d20717 --- /dev/null +++ 
b/text_analysis/.ipynb_checkpoints/topicModel-checkpoint.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import numpy as np\n", + "import pandas as pd\n", + "import glob\n", + "import copy\n", + "import csv\n", + "from statistics import mean, median\n", + "from strip_markdown import strip_markdown\n", + "import joblib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9483091c-ac72-415c-932d-ac7cf7970789", + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "import gensim.corpora as corpora\n", + "from gensim.utils import simple_preprocess\n", + "from gensim.models import CoherenceModel\n", + "from gensim.models.phrases import Phrases\n", + "\n", + "from sklearn.decomposition import LatentDirichletAllocation\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "\n", + "from statistics import mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3da6b590-875d-478d-aaaa-de020039c519", + "metadata": {}, + "outputs": [], + "source": [ + "# spacy and nltk for lemmatization\n", + "import nltk \n", + "#nltk.download('stopwords')\n", + "import spacy\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "\n", + "stopwords = stopwords.words('english')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b", + "metadata": {}, + "outputs": [], + "source": [ + "def metadata_for_file(file):\n", + " word_list = file.split()\n", + " word_count = len(word_list)\n", + " #print(word_list)\n", + " if word_count == 0:\n", + " avg_word_length = 0\n", + " else: \n", + " avg_word_length = sum(map(len, word_list)) / len(word_list)\n", + " #return number of paragraphs\n", + 
" return word_count, avg_word_length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_data_from_dir(directory):\n", + " files = glob.glob(f\"{directory}/*\")\n", + " data_list = []\n", + " word_counts = []\n", + " avg_word_lengths = []\n", + " file_list = []\n", + " for file in files:\n", + " text = open(file, encoding='utf-8').read()\n", + " #here's some of the descriptive text analysis\n", + " word_count, avg_word_length = metadata_for_file(text)\n", + " word_counts.append(word_count)\n", + " avg_word_lengths.append(avg_word_length)\n", + " #adding the data to the list of text\n", + " data_list.append(text)\n", + " #adding filename\n", + " file_list.append(file)\n", + " return data_list, word_counts, avg_word_lengths, file_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b332b10-bfc8-4566-8c52-19a8a334af00", + "metadata": {}, + "outputs": [], + "source": [ + "#preprocessing text data\n", + "def preprocess(corpus_list):\n", + " #extending stopwords \n", + " specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n", + " \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n", + " stopwords.extend(specific_stopwords)\n", + " D = copy.copy(corpus_list)\n", + " #stripping markdown from documents\n", + " D = [strip_markdown(doc) for doc in D]\n", + " #strip html \n", + " D = [re.sub(r'', '', doc, flags=re.DOTALL) for doc in D]\n", + " #mvp right now, can certainly be expanded as iterations of text analysis are done\n", + " D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]\n", + " lemmatizer = WordNetLemmatizer()\n", + " D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n", + " return D_lemma" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7", + "metadata": {}, + "outputs": [], + "source": [ + "#preparing processed data for model usage\n", + "def text_preparation(lemmatized_text):\n", + " #bigrams\n", + " D_bigrams = copy.copy(lemmatized_text)\n", + " bigram = Phrases(D_bigrams, min_count=2)\n", + " for i in range(len(lemmatized_text)):\n", + " for token in bigram[D_bigrams[i]]:\n", + " if '_' in token:\n", + " D_bigrams[i].append(token)\n", + " #id2word\n", + " id2word = corpora.Dictionary(D_bigrams)\n", + " id2word.filter_extremes(no_below=5, no_above=0.5)\n", + " #bow representation \n", + " bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n", + " return bag_of_words, id2word" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24799e25-2c0c-4e16-b503-68296f604f52", + "metadata": {}, + "outputs": [], + "source": [ + "def lda_model_identification(data_vectorized):\n", + " lda = LatentDirichletAllocation()\n", + " search_params = {'n_components': [TKTK], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }\n", + " model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n", + " model.fit(data_vectorized)\n", + " best_lda_model = model.best_estimator_\n", + " print(\"Best Model's Params: \", model.best_params_)\n", + " print(\"Best Log Likelihood Score: \", model.best_score_)\n", + " print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5", + "metadata": {}, + "outputs": [], + "source": [ + "def best_lda_model(data_vectorized, vocab):\n", + " lda = LatentDirichletAllocation(n_components=TKTK, learning_decay = TKTK, batch_size = TKTK, max_iter = TKTK)\n", + " id_topic = lda.fit_transform(data_vectorized)\n", + " topic_words = {}\n", + " for topic, comp in enumerate(lda.components_):\n", + " word_idx = np.argsort(comp)[::-1][:10]\n", + " 
topic_words[topic] = [vocab[i] for i in word_idx]\n", + " for topic, words in topic_words.items():\n", + " print('Topic: %d' % topic)\n", + " print(' %s' % ', '.join(words))\n", + " #lda.print_topics(num_words=10)\n", + " joblib.dump(lda, '020125_DOCTYPE_lda.jl')\n", + " #lda = joblib.load('0509_lda.jl')\n", + " return id_topic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27", + "metadata": {}, + "outputs": [], + "source": [ + "def get_most_prevalent(vect_documents, documents):\n", + " lda = joblib.load('TKTK_lda.jl')\n", + " distributions = lda.transform(vect_documents)\n", + " most_prevalent = {0: [0, \"\"],1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"]}\n", + " for i, topic_distribution in enumerate(distributions):\n", + " for j in range(8):\n", + " if topic_distribution[j] > most_prevalent[j][0]:\n", + " most_prevalent[j] = [topic_distribution[j], documents[i]]\n", + " print(most_prevalent)\n", + " return most_prevalent\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec", + "metadata": {}, + "outputs": [], + "source": [ + "def prevalent_topics(vect_documents, file_list):\n", + " lda = joblib.load('TKTKTKTK_lda.jl')\n", + " #lda = joblib.load('0514_contrib_lda.jl')\n", + " distributions = lda.transform(vect_documents)\n", + " #figuring out what the max distribution is and then figuring out the mode\n", + " top_topic = []\n", + " count_of_multiple = 0\n", + " topic_arrays = []\n", + " for i, topic_distribution in enumerate(distributions):\n", + " max_dist = max(topic_distribution)\n", + " indexes = np.where(topic_distribution == max_dist)[0]\n", + " if len(indexes) == 1:\n", + " top_topic.append(indexes[0])\n", + " else:\n", + " count_of_multiple += 1\n", + " topic_arrays.append(topic_distribution)\n", + " most_frequent(top_topic)\n", + " print(count_of_multiple)\n", + " df = 
pd.DataFrame(topic_arrays)\n", + " #finding the distribution values for all documents\n", + " with open('readme_file_topic_distributions.csv', 'w', newline='') as csvfile:\n", + " fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']\n", + " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", + " writer.writeheader()\n", + " for i, row in df.iterrows():\n", + " project_dir = {}\n", + " project_dir['filename'] = file_list[i].split(\"/\")[-1]\n", + " array_row = df.iloc[i].to_numpy()\n", + " for j in range(8):\n", + " project_dir[\"t\" + str(j)] = array_row[j]\n", + " writer.writerow(project_dir)\n", + " #print(df.sort_values(by=['0']).head(5))\n", + " '''\n", + " for i in range(8):\n", + " print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n", + " top5 = df.nlargest(10, i)\n", + " top_indices = top5.index.to_list()\n", + " print(top5)\n", + " for index in top_indices:\n", + " print(file_list[index])\n", + " bottom5 = df.nsmallest(10, i)\n", + " bottom_indices = bottom5.index.to_list()\n", + " print(bottom5)\n", + " for index in bottom_indices:\n", + " print(file_list[index])\n", + " '''\n", + " averages = df.mean()\n", + " print(averages)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3", + "metadata": {}, + "outputs": [], + "source": [ + "def most_frequent(topic_prevalence):\n", + " most_frequent_array = []\n", + " for j in range(4):\n", + " topic = mode(topic_prevalence)\n", + " most_frequent_array.append(topic)\n", + " topic_prevalence = [i for i in topic_prevalence if i != topic]\n", + " print(most_frequent_array)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f937c2e-2714-475d-b670-602164c46642", + "metadata": {}, + "outputs": [], + "source": [ + "listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n", + "print(\"Mean wordcount: \", mean(wordcounts))\n", + "print(\"Median 
wordcount: \", median(wordcounts))\n", + "print(\"Mean wordlength: \", mean(wordlengths))\n", + "print(\"Median wordlength: \", median(wordlengths))\n", + "lemmatized_corpus = preprocess(listed_corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de", + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(analyzer='word', \n", + " min_df=2, \n", + " stop_words='english', \n", + " lowercase=True, \n", + " token_pattern='[a-zA-Z0-9]{2,}', \n", + " )\n", + "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n", + "joblib.dump(vectorizer, '020125_DOCTYPE_vectorizer.joblib'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/partitioned_readability.ipynb b/text_analysis/partitioned_readability.ipynb new file mode 100644 index 0000000..0bd6566 --- /dev/null +++ b/text_analysis/partitioned_readability.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f4c4796f-d109-472d-8f9c-95c6ec85f757", + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "import textstat\n", + "import csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f1f2fce-2335-4ee3-81f2-55822e2f63f9", + "metadata": {}, + "outputs": [], + "source": [ + "readme_wd = \"\"\n", + "contributing_wd = \"\"\n", + "\n", + "csv_fieldnames = ['subdir', 'filename', 'flesch_reading_ease', 'flesch_kincaid_grade', 'linsear_write_formula', 'dale_chall_readability_score', 'mcalpine_eflaw', 'reading_time', 'char_count', 'word_count']" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0d3b5b1-ae97-4a46-95e0-92232c46c2fa", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "gets the 3 readability scores for each individual textfile\n", + "'''\n", + "def get_readibility(file_address, file_dict):\n", + " file = open(file_address, \"r\")\n", + " document = file.read()\n", + " file_dict['flesch_reading_ease'] = textstat.flesch_reading_ease(document)\n", + " file_dict['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(document)\n", + " file_dict['linsear_write_formula'] = textstat.linsear_write_formula(document)\n", + " file_dict['dale_chall_readability_score'] = textstat.dale_chall_readability_score(document)\n", + " file_dict['mcalpine_eflaw'] = textstat.mcalpine_eflaw(document)\n", + " file_dict['reading_time'] = textstat.reading_time(document, ms_per_char=14.69)\n", + " file_dict['char_count'] = textstat.char_count(document, ignore_spaces=True)\n", + " file_dict['word_count'] = textstat.lexicon_count(document, removepunct=True)\n", + " return file_dict\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3c481e-c521-4e1d-926e-88f4b75ae7de", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "getting readability scoring for each type of document\n", + "'''\n", + "def generate_file(output_csv, wdirectory, document_type):\n", + " with open(output_csv, 'w') as csvfile: \n", + " writer = csv.DictWriter(csvfile, fieldnames = csv_fieldnames) \n", + " writer.writeheader() \n", + " subdirs = os.listdir(wdirectory)\n", + " print(document_type)\n", + " for dir in subdirs: \n", + " print(dir)\n", + " files = os.listdir(wdirectory + \"/\" + dir)\n", + " count = 0\n", + " for file in files:\n", + " file_dict = {\"subdir\": dir, \"filename\": file}\n", + " print(file)\n", + " full_address = wdirectory + \"/\" + dir + \"/\" + file\n", + " file_dict = get_readibility(full_address, file_dict)\n", + " writer.writerow(file_dict)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "0e0a7b88-49b6-4053-84b8-f54f1c6536c0", + "metadata": {}, + "outputs": [], + "source": [ + "generate_file('dwo_readability_contributing.csv', contributing_wd, \"contributing\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/partitioning_dirs.ipynb b/text_analysis/partitioning_dirs.ipynb new file mode 100644 index 0000000..a5cdac3 --- /dev/null +++ b/text_analysis/partitioning_dirs.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "76168c17-548e-4bf2-a1fa-6c0b6372262a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/topicModel.ipynb b/text_analysis/topicModel.ipynb new file mode 100644 index 0000000..0d20717 --- /dev/null +++ b/text_analysis/topicModel.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import numpy as np\n", + "import pandas as pd\n", + "import glob\n", + "import copy\n", + "import csv\n", + "from statistics import 
mean, median\n", + "from strip_markdown import strip_markdown\n", + "import joblib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9483091c-ac72-415c-932d-ac7cf7970789", + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "import gensim.corpora as corpora\n", + "from gensim.utils import simple_preprocess\n", + "from gensim.models import CoherenceModel\n", + "from gensim.models.phrases import Phrases\n", + "\n", + "from sklearn.decomposition import LatentDirichletAllocation\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "\n", + "from statistics import mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3da6b590-875d-478d-aaaa-de020039c519", + "metadata": {}, + "outputs": [], + "source": [ + "# spacy and nltk for lemmatization\n", + "import nltk \n", + "#nltk.download('stopwords')\n", + "import spacy\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "\n", + "stopwords = stopwords.words('english')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b", + "metadata": {}, + "outputs": [], + "source": [ + "def metadata_for_file(file):\n", + " word_list = file.split()\n", + " word_count = len(word_list)\n", + " #print(word_list)\n", + " if word_count == 0:\n", + " avg_word_length = 0\n", + " else: \n", + " avg_word_length = sum(map(len, word_list)) / len(word_list)\n", + " #return number of paragraphs\n", + " return word_count, avg_word_length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_data_from_dir(directory):\n", + " files = glob.glob(f\"{directory}/*\")\n", + " data_list = []\n", + " word_counts = []\n", + " avg_word_lengths = []\n", + " file_list = []\n", + " for file 
in files:\n", + " text = open(file, encoding='utf-8').read()\n", + " #here's some of the descriptive text analysis\n", + " word_count, avg_word_length = metadata_for_file(text)\n", + " word_counts.append(word_count)\n", + " avg_word_lengths.append(avg_word_length)\n", + " #adding the data to the list of text\n", + " data_list.append(text)\n", + " #adding filename\n", + " file_list.append(file)\n", + " return data_list, word_counts, avg_word_lengths, file_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b332b10-bfc8-4566-8c52-19a8a334af00", + "metadata": {}, + "outputs": [], + "source": [ + "#preprocessing text data\n", + "def preprocess(corpus_list):\n", + " #extending stopwords \n", + " specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n", + " \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n", + " stopwords.extend(specific_stopwords)\n", + " D = copy.copy(corpus_list)\n", + " #stripping markdown from documents\n", + " D = [strip_markdown(doc) for doc in D]\n", + " #strip html \n", + " D = [re.sub(r'', '', doc, flags=re.DOTALL) for doc in D]\n", + " #mvp right now, can certainly be expanded as iterations of text analysis are done\n", + " D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]\n", + " lemmatizer = WordNetLemmatizer()\n", + " D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n", + " return D_lemma" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7", + "metadata": {}, + "outputs": [], + "source": [ + "#preparing processed data for model usage\n", + "def text_preparation(lemmatized_text):\n", + " #bigrams\n", + " D_bigrams = copy.copy(lemmatized_text)\n", + " bigram = Phrases(D_bigrams, min_count=2)\n", + " for i in range(len(lemmatized_text)):\n", + " 
for token in bigram[D_bigrams[i]]:\n", + " if '_' in token:\n", + " D_bigrams[i].append(token)\n", + " #id2word\n", + " id2word = corpora.Dictionary(D_bigrams)\n", + " id2word.filter_extremes(no_below=5, no_above=0.5)\n", + " #bow representation \n", + " bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n", + " return bag_of_words, id2word" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24799e25-2c0c-4e16-b503-68296f604f52", + "metadata": {}, + "outputs": [], + "source": [ + "def lda_model_identification(data_vectorized):\n", + " lda = LatentDirichletAllocation()\n", + " search_params = {'n_components': [TKTK], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }\n", + " model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n", + " model.fit(data_vectorized)\n", + " best_lda_model = model.best_estimator_\n", + " print(\"Best Model's Params: \", model.best_params_)\n", + " print(\"Best Log Likelihood Score: \", model.best_score_)\n", + " print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5", + "metadata": {}, + "outputs": [], + "source": [ + "def best_lda_model(data_vectorized, vocab):\n", + " lda = LatentDirichletAllocation(n_components=TKTK, learning_decay = TKTK, batch_size = TKTK, max_iter = TKTK)\n", + " id_topic = lda.fit_transform(data_vectorized)\n", + " topic_words = {}\n", + " for topic, comp in enumerate(lda.components_):\n", + " word_idx = np.argsort(comp)[::-1][:10]\n", + " topic_words[topic] = [vocab[i] for i in word_idx]\n", + " for topic, words in topic_words.items():\n", + " print('Topic: %d' % topic)\n", + " print(' %s' % ', '.join(words))\n", + " #lda.print_topics(num_words=10)\n", + " joblib.dump(lda, '020125_DOCTYPE_lda.jl')\n", + " #lda = joblib.load('0509_lda.jl')\n", + " return id_topic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"80bcdc6c-8a3d-4738-87b5-15e6c2db3a27", + "metadata": {}, + "outputs": [], + "source": [ + "def get_most_prevalent(vect_documents, documents):\n", + " lda = joblib.load('TKTK_lda.jl')\n", + " distributions = lda.transform(vect_documents)\n", + " most_prevalent = {0: [0, \"\"],1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"]}\n", + " for i, topic_distribution in enumerate(distributions):\n", + " for j in range(8):\n", + " if topic_distribution[j] > most_prevalent[j][0]:\n", + " most_prevalent[j] = [topic_distribution[j], documents[i]]\n", + " print(most_prevalent)\n", + " return most_prevalent\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec", + "metadata": {}, + "outputs": [], + "source": [ + "def prevalent_topics(vect_documents, file_list):\n", + " lda = joblib.load('TKTKTKTK_lda.jl')\n", + " #lda = joblib.load('0514_contrib_lda.jl')\n", + " distributions = lda.transform(vect_documents)\n", + " #figuring out what the max distribution is and then figuring out the mode\n", + " top_topic = []\n", + " count_of_multiple = 0\n", + " topic_arrays = []\n", + " for i, topic_distribution in enumerate(distributions):\n", + " max_dist = max(topic_distribution)\n", + " indexes = np.where(topic_distribution == max_dist)[0]\n", + " if len(indexes) == 1:\n", + " top_topic.append(indexes[0])\n", + " else:\n", + " count_of_multiple += 1\n", + " topic_arrays.append(topic_distribution)\n", + " most_frequent(top_topic)\n", + " print(count_of_multiple)\n", + " df = pd.DataFrame(topic_arrays)\n", + " #finding the distribution values for all documents\n", + " with open('readme_file_topic_distributions.csv', 'w', newline='') as csvfile:\n", + " fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']\n", + " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", + " writer.writeheader()\n", + " for i, row in df.iterrows():\n", + " project_dir = {}\n", + " 
project_dir['filename'] = file_list[i].split(\"/\")[-1]\n", + " array_row = df.iloc[i].to_numpy()\n", + " for j in range(8):\n", + " project_dir[\"t\" + str(j)] = array_row[j]\n", + " writer.writerow(project_dir)\n", + " #print(df.sort_values(by=['0']).head(5))\n", + " '''\n", + " for i in range(8):\n", + " print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n", + " top5 = df.nlargest(10, i)\n", + " top_indices = top5.index.to_list()\n", + " print(top5)\n", + " for index in top_indices:\n", + " print(file_list[index])\n", + " bottom5 = df.nsmallest(10, i)\n", + " bottom_indices = bottom5.index.to_list()\n", + " print(bottom5)\n", + " for index in bottom_indices:\n", + " print(file_list[index])\n", + " '''\n", + " averages = df.mean()\n", + " print(averages)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3", + "metadata": {}, + "outputs": [], + "source": [ + "def most_frequent(topic_prevalence):\n", + " most_frequent_array = []\n", + " for j in range(4):\n", + " topic = mode(topic_prevalence)\n", + " most_frequent_array.append(topic)\n", + " topic_prevalence = [i for i in topic_prevalence if i != topic]\n", + " print(most_frequent_array)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f937c2e-2714-475d-b670-602164c46642", + "metadata": {}, + "outputs": [], + "source": [ + "listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n", + "print(\"Mean wordcount: \", mean(wordcounts))\n", + "print(\"Median wordcount: \", median(wordcounts))\n", + "print(\"Mean wordlength: \", mean(wordlengths))\n", + "print(\"Median wordlength: \", median(wordlengths))\n", + "lemmatized_corpus = preprocess(listed_corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de", + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(analyzer='word', \n", 
+ " min_df=2, \n", + " stop_words='english', \n", + " lowercase=True, \n", + " token_pattern='[a-zA-Z0-9]{2,}', \n", + " )\n", + "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n", + "joblib.dump(vectorizer, '020125_DOCTYPE_vectorizer.joblib')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}