{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262", "metadata": {}, "outputs": [], "source": [ "import re\n", "import numpy as np\n", "import pandas as pd\n", "import glob\n", "import copy\n", "import csv\n", "from statistics import mean, median\n", "from strip_markdown import strip_markdown\n", "import joblib" ] },
{ "cell_type": "code", "execution_count": null, "id": "9483091c-ac72-415c-932d-ac7cf7970789", "metadata": {}, "outputs": [], "source": [ "import gensim\n", "import gensim.corpora as corpora\n", "from gensim.utils import simple_preprocess\n", "from gensim.models import CoherenceModel\n", "from gensim.models.phrases import Phrases\n", "\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "\n", "from statistics import mode" ] },
{ "cell_type": "code", "execution_count": null, "id": "3da6b590-875d-478d-aaaa-de020039c519", "metadata": {}, "outputs": [], "source": [ "# spacy and nltk for lemmatization\n", "import nltk\n", "#nltk.download('stopwords')\n", "#nltk.download('wordnet')  # wordnet data is needed by WordNetLemmatizer\n", "import spacy\n", "from nltk.corpus import stopwords\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "\n", "stopwords = stopwords.words('english')" ] },
{ "cell_type": "code", "execution_count": null, "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b", "metadata": {}, "outputs": [], "source": [ "def metadata_for_file(file):\n", "    word_list = file.split()\n", "    word_count = len(word_list)\n", "    #print(word_list)\n", "    if word_count == 0:\n", "        avg_word_length = 0\n", "    else:\n", "        avg_word_length = sum(map(len, word_list)) / len(word_list)\n", "    #return the word count and average word length for the file\n", "    return word_count, avg_word_length" ] },
{ "cell_type": "code", "execution_count": null, "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3", "metadata": {}, "outputs": [], "source": [ "def get_data_from_dir(directory):\n", "    files = glob.glob(f\"{directory}/*\")\n", "    data_list = []\n", "    word_counts = []\n", "    avg_word_lengths = []\n", "    file_list = []\n", "    for file in files:\n", "        with open(file, encoding='utf-8') as f:\n", "            text = f.read()\n", "        #here's some of the descriptive text analysis\n", "        word_count, avg_word_length = metadata_for_file(text)\n", "        word_counts.append(word_count)\n", "        avg_word_lengths.append(avg_word_length)\n", "        #adding the data to the list of text\n", "        data_list.append(text)\n", "        #adding filename\n", "        file_list.append(file)\n", "    return data_list, word_counts, avg_word_lengths, file_list" ] },
{ "cell_type": "code", "execution_count": null, "id": "2b332b10-bfc8-4566-8c52-19a8a334af00", "metadata": {}, "outputs": [], "source": [ "#preprocessing text data\n", "def preprocess(corpus_list):\n", "    #extending stopwords with domain-specific terms\n", "    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n", "                          \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n", "    stopwords.extend(specific_stopwords)\n", "    D = copy.copy(corpus_list)\n", "    #stripping markdown from documents\n", "    D = [strip_markdown(doc) for doc in D]\n", "    #stripping html tags and comments\n", "    D = [re.sub(r'<!--.*?-->|<[^>]+>', '', doc, flags=re.DOTALL) for doc in D]\n", "    #mvp right now, can certainly be expanded as iterations of text analysis are done\n", "    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2] for doc in D]\n", "    lemmatizer = WordNetLemmatizer()\n", "    D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n", "    return D_lemma" ] },
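{ "cell_type": "code", "execution_count": null, "id": "c0ffee00-0000-4000-8000-000000000001", "metadata": {}, "outputs": [], "source": [ "# A minimal, hedged sanity check (not part of the original pipeline): run the cleaning\n", "# steps above on a tiny made-up document; sample_docs is an illustrative assumption.\n", "sample_docs = [\"# Example README\\n\\nThis *project* provides <b>tools</b> for parsing text files.\"]\n", "print(preprocess(sample_docs))" ] },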
{ "cell_type": "code", "execution_count": null, "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7", "metadata": {}, "outputs": [], "source": [ "#preparing processed data for model usage\n", "def text_preparation(lemmatized_text):\n", "    #preprocess() returns space-joined strings, so split each document back into a token list\n", "    D_bigrams = [doc.split() for doc in lemmatized_text]\n", "    #bigrams\n", "    bigram = Phrases(D_bigrams, min_count=2)\n", "    for i in range(len(D_bigrams)):\n", "        for token in bigram[D_bigrams[i]]:\n", "            if '_' in token:\n", "                D_bigrams[i].append(token)\n", "    #id2word\n", "    id2word = corpora.Dictionary(D_bigrams)\n", "    id2word.filter_extremes(no_below=5, no_above=0.5)\n", "    #bow representation\n", "    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n", "    return bag_of_words, id2word" ] },
{ "cell_type": "code", "execution_count": null, "id": "24799e25-2c0c-4e16-b503-68296f604f52", "metadata": {}, "outputs": [], "source": [ "def lda_model_identification(data_vectorized):\n", "    lda = LatentDirichletAllocation()\n", "    #TKTK is a placeholder: fill in the candidate numbers of topics before running the search\n", "    search_params = {'n_components': [TKTK], 'learning_decay': [.5, .7, .9], 'batch_size': [128, 256]}\n", "    model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n", "    model.fit(data_vectorized)\n", "    best_lda_model = model.best_estimator_\n", "    print(\"Best Model's Params: \", model.best_params_)\n", "    print(\"Best Log Likelihood Score: \", model.best_score_)\n", "    print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))" ] },
{ "cell_type": "code", "execution_count": null, "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5", "metadata": {}, "outputs": [], "source": [ "def best_lda_model(data_vectorized, vocab):\n", "    #TKTK placeholders: fill in the hyperparameters selected by the grid search above\n", "    lda = LatentDirichletAllocation(n_components=TKTK, learning_decay=TKTK, batch_size=TKTK, max_iter=TKTK)\n", "    id_topic = lda.fit_transform(data_vectorized)\n", "    topic_words = {}\n", "    for topic, comp in enumerate(lda.components_):\n", "        word_idx = np.argsort(comp)[::-1][:10]\n", "        topic_words[topic] = [vocab[i] for i in word_idx]\n", "    for topic, words in topic_words.items():\n", "        print('Topic: %d' % topic)\n", "        print('  %s' % ', '.join(words))\n", "    #lda.print_topics(num_words=10)\n", "    joblib.dump(lda, '020125_DOCTYPE_lda.jl')\n", "    #lda = joblib.load('0509_lda.jl')\n", "    return id_topic" ] },
{ "cell_type": "code", "execution_count": null, "id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27", "metadata": {}, "outputs": [], "source": [ "def get_most_prevalent(vect_documents, documents):\n", "    #TKTK is a placeholder for the filename of the model saved by best_lda_model\n", "    lda = joblib.load('TKTK_lda.jl')\n", "    distributions = lda.transform(vect_documents)\n", "    #for each of the 8 topics, track the document with the highest weight on that topic\n", "    most_prevalent = {0: [0, \"\"], 1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"]}\n", "    for i, topic_distribution in enumerate(distributions):\n", "        for j in range(8):\n", "            if topic_distribution[j] > most_prevalent[j][0]:\n", "                most_prevalent[j] = [topic_distribution[j], documents[i]]\n", "    print(most_prevalent)\n", "    return most_prevalent\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec", "metadata": {}, "outputs": [], "source": [ "def prevalent_topics(vect_documents, file_list):\n", "    #TKTKTKTK is a placeholder for the filename of the model saved by best_lda_model\n", "    lda = joblib.load('TKTKTKTK_lda.jl')\n", "    #lda = joblib.load('0514_contrib_lda.jl')\n", "    distributions = lda.transform(vect_documents)\n", "    #figuring out what the max distribution is and then figuring out the mode\n", "    top_topic = []\n", "    count_of_multiple = 0\n", "    topic_arrays = []\n", "    for i, topic_distribution in enumerate(distributions):\n", "        max_dist = max(topic_distribution)\n", "        indexes = np.where(topic_distribution == max_dist)[0]\n", "        if len(indexes) == 1:\n", "            top_topic.append(indexes[0])\n", "        else:\n", "            count_of_multiple += 1\n", "        topic_arrays.append(topic_distribution)\n", "    most_frequent(top_topic)\n", "    print(count_of_multiple)\n", "    df = pd.DataFrame(topic_arrays)\n", "    #finding the distribution values for all documents\n", "    with open('readme_file_topic_distributions.csv', 'w', newline='') as csvfile:\n", "        fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']\n", "        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", "        writer.writeheader()\n", "        for i, row in df.iterrows():\n", "            project_dir = {}\n", "            project_dir['filename'] = file_list[i].split(\"/\")[-1]\n", "            array_row = df.iloc[i].to_numpy()\n", "            for j in range(8):\n", "                project_dir[\"t\" + str(j)] = array_row[j]\n", "            writer.writerow(project_dir)\n", "    #print(df.sort_values(by=['0']).head(5))\n", "    '''\n", "    for i in range(8):\n", "        print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n", "        top5 = df.nlargest(10, i)\n", "        top_indices = top5.index.to_list()\n", "        print(top5)\n", "        for index in top_indices:\n", "            print(file_list[index])\n", "        bottom5 = df.nsmallest(10, i)\n", "        bottom_indices = bottom5.index.to_list()\n", "        print(bottom5)\n", "        for index in bottom_indices:\n", "            print(file_list[index])\n", "    '''\n", "    averages = df.mean()\n", "    print(averages)\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3", "metadata": {}, "outputs": [], "source": [ "def most_frequent(topic_prevalence):\n", "    #print the four most common top topics across documents\n", "    most_frequent_array = []\n", "    for j in range(4):\n", "        topic = mode(topic_prevalence)\n", "        most_frequent_array.append(topic)\n", "        topic_prevalence = [i for i in topic_prevalence if i != topic]\n", "    print(most_frequent_array)" ] },
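{ "cell_type": "code", "execution_count": null, "id": "c0ffee00-0000-4000-8000-000000000002", "metadata": {}, "outputs": [], "source": [ "# A hedged sketch (not part of the original notebook) of how the functions defined above\n", "# fit together once the TKTK placeholders are filled in. It assumes the loading and\n", "# vectorization cells below have already been run; then these calls can be uncommented.\n", "#lda_model_identification(data_vectorized)\n", "#doc_topics = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())\n", "#get_most_prevalent(data_vectorized, listed_corpus)\n", "#prevalent_topics(data_vectorized, file_list)" ] },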
{ "cell_type": "code", "execution_count": null, "id": "1f937c2e-2714-475d-b670-602164c46642", "metadata": {}, "outputs": [], "source": [ "readme_directory = \"TKTK\"  #placeholder: set to the directory containing the README files\n", "listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n", "print(\"Mean wordcount: \", mean(wordcounts))\n", "print(\"Median wordcount: \", median(wordcounts))\n", "print(\"Mean wordlength: \", mean(wordlengths))\n", "print(\"Median wordlength: \", median(wordlengths))\n", "lemmatized_corpus = preprocess(listed_corpus)" ] },
{ "cell_type": "code", "execution_count": null, "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de", "metadata": {}, "outputs": [], "source": [ "vectorizer = CountVectorizer(analyzer='word',\n", "                             min_df=2,\n", "                             stop_words='english',\n", "                             lowercase=True,\n", "                             token_pattern='[a-zA-Z0-9]{2,}',\n", "                             )\n", "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n", "joblib.dump(vectorizer, '020125_DOCTYPE_vectorizer.joblib')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 }