1
0
govdoc-cr-analysis/text_analysis/contributingtopicModel.ipynb

797 lines
46 KiB
Plaintext
Raw Normal View History

2025-02-02 21:42:09 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"import glob\n",
"import copy\n",
"import csv\n",
"from statistics import mean, median\n",
"from strip_markdown import strip_markdown\n",
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9483091c-ac72-415c-932d-ac7cf7970789",
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import gensim.corpora as corpora\n",
"from gensim.utils import simple_preprocess\n",
"from gensim.models import CoherenceModel\n",
"from gensim.models.phrases import Phrases\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"from statistics import mode"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "196abd6a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /home/SOC.NORTHWESTERN.EDU/nws8519/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch the WordNet corpus; the WordNetLemmatizer used during preprocessing relies on it.\n",
"import nltk\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3da6b590-875d-478d-aaaa-de020039c519",
"metadata": {},
"outputs": [],
"source": [
"# spacy and nltk for lemmatization\n",
"import nltk \n",
"#nltk.download('stopwords')\n",
"import spacy\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"\n",
"# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a plain\n",
"# list of English stopwords; preprocess() reads that module-level list.\n",
"stopwords = stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
"metadata": {},
"outputs": [],
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Return ``(word_count, avg_word_length)`` for the text held in ``file``.\n",
"\n",
"    ``file`` is the document text itself (not a path); it is split on whitespace.\n",
"    A text containing no words reports an average word length of 0.\n",
"    \"\"\"\n",
"    tokens = file.split()\n",
"    n_tokens = len(tokens)\n",
"    # guard clause: avoid dividing by zero for empty / whitespace-only text\n",
"    if not tokens:\n",
"        return n_tokens, 0\n",
"    total_chars = sum(len(tok) for tok in tokens)\n",
"    return n_tokens, total_chars / n_tokens"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_dir(directory):\n",
"    \"\"\"Read every entry matched by ``{directory}/*`` and collect per-file text statistics.\n",
"\n",
"    Returns four parallel lists with one element per file, in sorted path order:\n",
"    the raw text, the word count, the average word length, and the file path.\n",
"    Files are decoded as UTF-8.\n",
"    \"\"\"\n",
"    # glob gives no ordering guarantee; sorting keeps document indices (which the\n",
"    # later reporting cells print) stable from one run to the next\n",
"    files = sorted(glob.glob(f\"{directory}/*\"))\n",
"    data_list = []\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    for file in files:\n",
"        # the context manager closes each handle; a bare open().read() left one open per file\n",
"        with open(file, encoding='utf-8') as handle:\n",
"            text = handle.read()\n",
"        #here's some of the descriptive text analysis\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        #adding the data to the list of text\n",
"        data_list.append(text)\n",
"        #adding filename\n",
"        file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
"metadata": {},
"outputs": [],
"source": [
"#preprocessing text data\n",
"def preprocess(corpus_list):\n",
"    \"\"\"Clean raw documents and return one space-joined string of lemmas per document.\n",
"\n",
"    Steps, in order: strip markdown, drop HTML comments, tokenize with gensim's\n",
"    ``simple_preprocess``, remove stopwords and tokens of two characters or fewer,\n",
"    then lemmatize each remaining token with WordNet. ``corpus_list`` is not modified.\n",
"    \"\"\"\n",
"    #corpus-specific stopwords on top of the NLTK English list\n",
"    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
"                          \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
"    #build a local set instead of extending the module-level list: the in-place\n",
"    #extend() grew the global on every call, and a set makes the membership test\n",
"    #in the token filter constant-time\n",
"    all_stopwords = set(stopwords).union(specific_stopwords)\n",
"    #stripping markdown from documents\n",
"    D = [strip_markdown(doc) for doc in corpus_list]\n",
"    #strip html comments (<!-- ... -->), including ones spanning several lines\n",
"    D = [re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL) for doc in D]\n",
"    #mvp right now, can certainly be expanded as iterations of text analysis are done\n",
"    D = [[token for token in simple_preprocess(doc) if token not in all_stopwords and len(token) > 2] for doc in D]\n",
"    #NOTE(review): stopwords are removed before lemmatization, so an inflected form can\n",
"    #still lemmatize to a stopword afterwards (the saved topic output lists 'file' and\n",
"    #'http'). Left as is because re-filtering would change the fitted vocabulary - confirm.\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n",
"    return D_lemma"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
"metadata": {},
"outputs": [],
"source": [
"#preparing processed data for model usage\n",
"def text_preparation(lemmatized_text):\n",
"    \"\"\"Build a gensim dictionary and bag-of-words corpus, augmented with bigrams.\n",
"\n",
"    ``lemmatized_text`` is a list of documents, each either a list of tokens or a\n",
"    whitespace-joined string of tokens (the form preprocess() returns). The input\n",
"    is left untouched.\n",
"\n",
"    Returns ``(bag_of_words, id2word)``.\n",
"    \"\"\"\n",
"    #copy every document: a shallow copy.copy() shares the inner lists, so the bigram\n",
"    #appends below would leak into the caller's data\n",
"    D_bigrams = [doc.split() if isinstance(doc, str) else list(doc) for doc in lemmatized_text]\n",
"    #bigrams\n",
"    bigram = Phrases(D_bigrams, min_count=2)\n",
"    for doc in D_bigrams:\n",
"        #collect the detected phrases first, then append them after the unigrams\n",
"        phrases = [token for token in bigram[doc] if '_' in token]\n",
"        doc.extend(phrases)\n",
"    #id2word\n",
"    id2word = corpora.Dictionary(D_bigrams)\n",
"    #drop tokens seen in fewer than 5 documents or in more than half of them\n",
"    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
"    #bow representation \n",
"    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n",
"    return bag_of_words, id2word"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "24799e25-2c0c-4e16-b503-68296f604f52",
"metadata": {},
"outputs": [],
"source": [
"def lda_model_identification(data_vectorized, random_state=None):\n",
"    \"\"\"Grid-search LDA settings on ``data_vectorized`` and report the best configuration.\n",
"\n",
"    Prints the best parameter set, its cross-validated score and the perplexity of\n",
"    the refitted best model on the full data, then returns that fitted model.\n",
"\n",
"    ``random_state`` is forwarded to LatentDirichletAllocation; the default of None\n",
"    keeps the original unseeded behaviour.\n",
"    \"\"\"\n",
"    #NOTE(review): scikit-learn documents learning_decay and batch_size as online-learning\n",
"    #settings while the estimator defaults to learning_method='batch'; if so this grid\n",
"    #mostly measures random-initialisation noise - confirm before relying on the winner.\n",
"    search_params = {'n_components': [5], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }\n",
"    search = GridSearchCV(LatentDirichletAllocation(random_state=random_state), param_grid=search_params, verbose=10)\n",
"    search.fit(data_vectorized)\n",
"    #named best_estimator so it does not shadow the best_lda_model() function\n",
"    best_estimator = search.best_estimator_\n",
"    print(\"Best Model's Params: \", search.best_params_)\n",
"    print(\"Best Log Likelihood Score: \", search.best_score_)\n",
"    print(\"Model Perplexity: \", best_estimator.perplexity(data_vectorized))\n",
"    return best_estimator"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
"metadata": {},
"outputs": [],
"source": [
"def best_lda_model(data_vectorized, vocab, n_components=5, n_top_words=10,\n",
"                   model_path='020125_CONTRIBUTING_lda.jl', random_state=None):\n",
"    \"\"\"Fit the chosen LDA configuration, print each topic's top words and save the model.\n",
"\n",
"    ``vocab`` maps a feature column index to its term. The fitted model is written\n",
"    to ``model_path`` with joblib. All keyword arguments default to the values that\n",
"    were previously hard-coded.\n",
"\n",
"    Returns the document-topic matrix produced by ``fit_transform``.\n",
"    \"\"\"\n",
"    lda = LatentDirichletAllocation(n_components=n_components, learning_decay=0.7, batch_size=256,\n",
"                                    max_iter=50, random_state=random_state)\n",
"    id_topic = lda.fit_transform(data_vectorized)\n",
"    for topic, comp in enumerate(lda.components_):\n",
"        #indices of the heaviest terms for this topic, largest weight first\n",
"        word_idx = np.argsort(comp)[::-1][:n_top_words]\n",
"        print('Topic: %d' % topic)\n",
"        print(' %s' % ', '.join(vocab[i] for i in word_idx))\n",
"    joblib.dump(lda, model_path)\n",
"    return id_topic"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
"metadata": {},
"outputs": [],
"source": [
"def get_most_prevalent(vect_documents, documents, model_path='020125_CONTRIBUTING_lda.jl'):\n",
"    \"\"\"For every topic, find the document that gives it the highest weight.\n",
"\n",
"    ``documents[i]`` must describe row ``i`` of ``vect_documents``. The saved LDA\n",
"    model is loaded from ``model_path``.\n",
"\n",
"    Prints and returns ``{topic: [weight, document]}``. A topic keeps the\n",
"    placeholder ``[0, \"\"]`` when no document gives it a positive weight.\n",
"    \"\"\"\n",
"    #joblib.load unpickles the file - only point this at model files you produced\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    #size the result from the model output rather than assuming five topics\n",
"    most_prevalent = {topic: [0, \"\"] for topic in range(distributions.shape[1])}\n",
"    for doc_idx, topic_distribution in enumerate(distributions):\n",
"        for topic, weight in enumerate(topic_distribution):\n",
"            #strict comparison: on a tie the earliest document is kept\n",
"            if weight > most_prevalent[topic][0]:\n",
"                most_prevalent[topic] = [weight, documents[doc_idx]]\n",
"    print(most_prevalent)\n",
"    return most_prevalent\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
"metadata": {},
"outputs": [],
"source": [
"def prevalent_topics(vect_documents, file_list, model_path='020125_CONTRIBUTING_lda.jl',\n",
"                     output_csv='020125_CONTRIBUTING_file_topic_distributions.csv', n_extremes=10):\n",
"    \"\"\"Report per-document topic distributions from the saved LDA model.\n",
"\n",
"    ``file_list[i]`` must be the path of the document in row ``i`` of\n",
"    ``vect_documents``. The function:\n",
"\n",
"    * prints how many documents have more than one topic tied for the top weight;\n",
"    * writes one CSV row per document (``filename, t0, t1, ...``) to ``output_csv``;\n",
"    * for each topic, prints the ``n_extremes`` highest- and lowest-weighted\n",
"      documents together with their paths;\n",
"    * prints the mean weight of every topic.\n",
"\n",
"    Keyword arguments default to the values that were previously hard-coded.\n",
"    Nothing is returned.\n",
"    \"\"\"\n",
"    #joblib.load unpickles the file - only point this at model files you produced\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    #documents whose maximum weight is shared by several topics\n",
"    count_of_multiple = 0\n",
"    for topic_distribution in distributions:\n",
"        if np.count_nonzero(topic_distribution == topic_distribution.max()) > 1:\n",
"            count_of_multiple += 1\n",
"    print(count_of_multiple)\n",
"    #row i of df is the distribution for file_list[i]; columns are topic numbers\n",
"    df = pd.DataFrame(distributions)\n",
"    n_topics = df.shape[1]\n",
"    topic_columns = ['t' + str(j) for j in range(n_topics)]\n",
"    #finding the distribution values for all documents\n",
"    with open(output_csv, 'w', newline='') as csvfile:\n",
"        writer = csv.DictWriter(csvfile, fieldnames=['filename'] + topic_columns)\n",
"        writer.writeheader()\n",
"        for i, row in df.iterrows():\n",
"            project_dir = {'filename': file_list[i].split(\"/\")[-1]}\n",
"            project_dir.update(zip(topic_columns, row.to_numpy()))\n",
"            writer.writerow(project_dir)\n",
"    for topic in range(n_topics):\n",
"        print(\"-----------------------Topic \" + str(topic) + \" --------------------------------\")\n",
"        #strongest documents first, then the weakest, each followed by its paths\n",
"        for extreme in (df.nlargest(n_extremes, topic), df.nsmallest(n_extremes, topic)):\n",
"            print(extreme)\n",
"            for index in extreme.index.to_list():\n",
"                print(file_list[index])\n",
"    averages = df.mean()\n",
"    print(averages)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
"metadata": {},
"outputs": [],
"source": [
"def most_frequent(topic_prevalence):\n",
"    \"\"\"Print and return up to five distinct values of ``topic_prevalence``, most common first.\n",
"\n",
"    The input sequence is not modified. Fewer than five values are returned when\n",
"    the input holds fewer than five distinct values; an empty input gives ``[]``.\n",
"    \"\"\"\n",
"    remaining = list(topic_prevalence)\n",
"    most_frequent_array = []\n",
"    #stop once every value has been ranked: statistics.mode raises StatisticsError on\n",
"    #an empty sequence, which a fixed five-pass loop hits when values run out early\n",
"    while remaining and len(most_frequent_array) < 5:\n",
"        topic = mode(remaining)\n",
"        most_frequent_array.append(topic)\n",
"        remaining = [value for value in remaining if value != topic]\n",
"    print(most_frequent_array)\n",
"    return most_frequent_array"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "69d606fd",
"metadata": {},
"outputs": [],
"source": [
"# Directory whose files are loaded as the corpus (absolute, machine-specific path).\n",
"contributing_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/\""
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1f937c2e-2714-475d-b670-602164c46642",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean wordcount: 357.8979020979021\n",
"Median wordcount: 225\n",
"Mean wordlength: 6.345704522542385\n",
"Median wordlength: 5.778551532033426\n"
]
}
],
"source": [
"# Load every document; the four returned lists are parallel (one entry per file).\n",
"listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(contributing_directory)\n",
"# Descriptive statistics are computed on the raw text, before any cleaning.\n",
"print(\"Mean wordcount: \", mean(wordcounts))\n",
"print(\"Median wordcount: \", median(wordcounts))\n",
"print(\"Mean wordlength: \", mean(wordlengths))\n",
"print(\"Median wordlength: \", median(wordlengths))\n",
"# One space-joined string of lemmas per document, in the same order as file_list.\n",
"lemmatized_corpus = preprocess(listed_corpus)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['020125_CONTRIBUTING_vectorizer.joblib']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-off fitting step, currently disabled by wrapping it in a string literal: it fits\n",
"# the CountVectorizer on the lemmatized corpus and saves it with joblib. The saved\n",
"# output of this cell comes from a run made while the code was still active.\n",
"'''\n",
"vectorizer = CountVectorizer(analyzer='word', \n",
" min_df=2, \n",
" stop_words='english', \n",
" lowercase=True, \n",
" token_pattern='[a-zA-Z0-9]{2,}', \n",
" )\n",
"data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
"joblib.dump(vectorizer, '020125_CONTRIBUTING_vectorizer.joblib')\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d68aaf7b",
"metadata": {},
"outputs": [],
"source": [
"# Reuse the vectorizer saved to disk rather than refitting it; transform() (not\n",
"# fit_transform) keeps the vocabulary, and therefore the feature columns, fixed.\n",
"vectorizer = joblib.load('020125_CONTRIBUTING_vectorizer.joblib')\n",
"data_vectorized = vectorizer.transform(lemmatized_corpus) "
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "dd1a70c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
"[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........\n",
"[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-196851.911 total time= 2.1s\n",
"[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........\n",
"[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-168250.194 total time= 2.0s\n",
"[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........\n",
"[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-180223.622 total time= 2.0s\n",
"[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........\n",
"[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-183729.380 total time= 2.0s\n",
"[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........\n",
"[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-174617.480 total time= 2.0s\n",
"[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=5..........\n",
"[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=5;, score=-197553.780 total time= 2.0s\n",
"[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=5..........\n",
"[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=5;, score=-168410.434 total time= 1.9s\n",
"[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=5..........\n",
"[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=5;, score=-180343.045 total time= 2.0s\n",
"[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=5..........\n",
"[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=5;, score=-182641.089 total time= 1.9s\n",
"[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=5..........\n",
"[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=5;, score=-174709.936 total time= 1.9s\n",
"[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=5..........\n",
"[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=5;, score=-196647.084 total time= 1.9s\n",
"[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=5..........\n",
"[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=5;, score=-168704.239 total time= 1.9s\n",
"[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=5..........\n",
"[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=5;, score=-180953.203 total time= 2.0s\n",
"[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=5..........\n",
"[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=5;, score=-183528.875 total time= 2.0s\n",
"[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=5..........\n",
"[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=5;, score=-174199.789 total time= 1.9s\n",
"[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=5..........\n",
"[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=5;, score=-197018.748 total time= 2.0s\n",
"[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=5..........\n",
"[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=5;, score=-169232.243 total time= 2.1s\n",
"[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=5..........\n",
"[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=5;, score=-179211.983 total time= 1.8s\n",
"[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=5..........\n",
"[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=5;, score=-184133.365 total time= 2.0s\n",
"[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=5..........\n",
"[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=5;, score=-173599.255 total time= 1.9s\n",
"[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=5..........\n",
"[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=5;, score=-196318.238 total time= 2.0s\n",
"[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=5..........\n",
"[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=5;, score=-168220.802 total time= 2.0s\n",
"[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=5..........\n",
"[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=5;, score=-179868.443 total time= 2.0s\n",
"[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=5..........\n",
"[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=5;, score=-183635.089 total time= 2.0s\n",
"[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=5..........\n",
"[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=5;, score=-174366.279 total time= 1.9s\n",
"[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=5..........\n",
"[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=5;, score=-198572.375 total time= 2.1s\n",
"[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=5..........\n",
"[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=5;, score=-168368.889 total time= 2.0s\n",
"[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=5..........\n",
"[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=5;, score=-180544.446 total time= 2.0s\n",
"[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=5..........\n",
"[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=5;, score=-183649.566 total time= 1.9s\n",
"[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=5..........\n",
"[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=5;, score=-174712.347 total time= 1.9s\n",
"Best Model's Params: {'batch_size': 256, 'learning_decay': 0.7, 'n_components': 5}\n",
"Best Log Likelihood Score: -180481.7700481068\n",
"Model Perplexity: 876.0922389848871\n"
]
}
],
"source": [
"#lda_model_identification(data_vectorized)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "aa83d20f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic: 0\n",
" test, new, function, example, style, file, make, build, command, version\n",
"Topic: 1\n",
" test, issue, request, pull, bug, http, feature, git, make, install\n",
"Topic: 2\n",
" git, test, branch, change, commit, make, request, pull, release, master\n",
"Topic: 3\n",
" contribution, license, project, open, submit, developer, right, contributor, sign, patch\n",
"Topic: 4\n",
" issue, request, pull, bug, project, change, contributing, contribution, feature, open\n"
]
}
],
"source": [
"#topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f4345bd6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: [0.999495078557156, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt'], 1: [0.9980153669818502, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/tantale_deprecated.git_hullabaloo_CONTRIBUTING.rst'], 2: [0.9989886873615608, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md'], 3: [0.9983908776533259, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/webcamoid_webcamoid.git_hullabaloo_CONTRIBUTING.md'], 4: [0.9980246890436791, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_pkix.git_hullabaloo_CONTRIBUTING.md']}\n"
]
}
],
"source": [
"# For each topic, the file path whose document gives that topic the highest weight;\n",
"# file_list is passed as the document labels, so the result maps topic -> [weight, path].\n",
"topic_prevalence = get_most_prevalent(data_vectorized, file_list)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "23468e82",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4\n",
"-----------------------Topic 0 --------------------------------\n",
" 0 1 2 3 4\n",
"536 0.999495 0.000126 0.000126 0.000127 0.000126\n",
"494 0.998076 0.000483 0.000480 0.000480 0.000481\n",
"403 0.997270 0.000682 0.000683 0.000677 0.000688\n",
"147 0.992964 0.001763 0.001779 0.001722 0.001773\n",
"564 0.992964 0.001763 0.001779 0.001722 0.001773\n",
"647 0.985526 0.013136 0.000446 0.000442 0.000450\n",
"106 0.985206 0.003688 0.003672 0.003728 0.003705\n",
"422 0.977476 0.000474 0.000469 0.000469 0.021112\n",
"502 0.967482 0.031760 0.000254 0.000251 0.000252\n",
"43 0.943894 0.001284 0.001282 0.052239 0.001301\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/osmcode_libosmium.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/angband_angband_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/terser_terser_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/mishoo_UglifyJS2_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/iovisor_bpftrace_hullabaloo_CONTRIBUTING-TOOLS.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/GNOME_network-manager-applet_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/iovisor_bcc.git_hullabaloo_CONTRIBUTING-SCRIPTS.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pgbackrest_pgbackrest_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/math-comp_math-comp_hullabaloo_CONTRIBUTING.md\n",
" 0 1 2 3 4\n",
"29 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"295 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"496 0.000277 0.056604 0.856781 0.000277 0.086060\n",
"643 0.000299 0.255602 0.304359 0.276117 0.163624\n",
"467 0.000350 0.471728 0.451339 0.000347 0.076235\n",
"372 0.000371 0.631292 0.000374 0.000373 0.367590\n",
"23 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"596 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"621 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"184 0.000383 0.895847 0.103008 0.000380 0.000383\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-apache_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/Icinga_icingaweb2.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/opencontainers_runc_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pygraphviz_pygraphviz.git_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/chaijs_chai_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-mysql_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-postgresql.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-concat_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/wtforms_flask-wtf.git_hullabaloo_CONTRIBUTING.rst\n",
"-----------------------Topic 1 --------------------------------\n",
" 0 1 2 3 4\n",
"109 0.000495 0.998015 0.000502 0.000491 0.000495\n",
"535 0.000507 0.997964 0.000514 0.000505 0.000510\n",
"173 0.000572 0.997721 0.000571 0.000569 0.000568\n",
"54 0.000598 0.997602 0.000594 0.000614 0.000592\n",
"546 0.000670 0.997320 0.000666 0.000679 0.000666\n",
"66 0.000672 0.997304 0.000674 0.000674 0.000676\n",
"220 0.000677 0.997289 0.000678 0.000677 0.000679\n",
"388 0.000701 0.997190 0.000703 0.000701 0.000704\n",
"227 0.000702 0.997190 0.000703 0.000702 0.000703\n",
"56 0.000720 0.997112 0.000723 0.000721 0.000723\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/tantale_deprecated.git_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pallets-eco_flask-sqlalchemy_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/scikit-learn_scikit-learn.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/libwww-perl_HTTP-Message.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/libwww-perl_URI.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/perl-catalyst_Catalyst-Authentication-Credential-HTTP_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/maxmind_geoip-api-perl.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/karenetheridge_B-Hooks-Parser.git_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/karenetheridge_Module-Manifest_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/karenetheridge_B-Hooks-OP-Check_hullabaloo_CONTRIBUTING\n",
" 0 1 2 3 4\n",
"536 0.999495 0.000126 0.000126 0.000127 0.000126\n",
"670 0.293267 0.000207 0.436114 0.000207 0.270205\n",
"29 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"295 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"261 0.883447 0.000317 0.115603 0.000314 0.000319\n",
"694 0.639602 0.000366 0.129429 0.000366 0.230238\n",
"23 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"596 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"621 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"477 0.155632 0.000388 0.701898 0.000386 0.141696\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/dbus_dbus_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-apache_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/GNOME_folks_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/google_gopacket.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-mysql_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-postgresql.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-concat_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/ddclient_ddclient.git_hullabaloo_CONTRIBUTING.md\n",
"-----------------------Topic 2 --------------------------------\n",
" 0 1 2 3 4\n",
"29 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"295 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"23 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"596 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"621 0.000374 0.000376 0.998492 0.000379 0.000379\n",
"433 0.000398 0.000400 0.998403 0.000398 0.000401\n",
"222 0.003336 0.003295 0.986750 0.003282 0.003337\n",
"378 0.003568 0.003524 0.985751 0.003589 0.003568\n",
"711 0.003559 0.003649 0.985621 0.003575 0.003596\n",
"706 0.004627 0.004655 0.981576 0.004568 0.004574\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-apache_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-mysql_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-postgresql.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-concat_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-xinetd_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/jbeder_yaml-cpp_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/mapbox_mapnik-vector-tile.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OSGeo_shapelib.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/faye_faye_hullabaloo_CONTRIBUTING.md\n",
" 0 1 2 3 4\n",
"536 0.999495 0.000126 0.000126 0.000127 0.000126\n",
"502 0.967482 0.031760 0.000254 0.000251 0.000252\n",
"331 0.350577 0.425615 0.000275 0.000275 0.223258\n",
"100 0.922202 0.076817 0.000326 0.000328 0.000327\n",
"548 0.166048 0.474650 0.000343 0.000339 0.358619\n",
"69 0.032624 0.858172 0.000354 0.039253 0.069597\n",
"460 0.067754 0.551877 0.000361 0.000361 0.379647\n",
"372 0.000371 0.631292 0.000374 0.000373 0.367590\n",
"393 0.000404 0.000401 0.000401 0.998391 0.000403\n",
"71 0.000404 0.677912 0.000407 0.013453 0.307824\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pgbackrest_pgbackrest_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/encode_django-rest-framework_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/rsms_inter_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/mathjax_MathJax_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pydicom_pydicom.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/ioquake_ioq3_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/chaijs_chai_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/webcamoid_webcamoid.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenTTD_OpenTTD.git_hullabaloo_CONTRIBUTING.md\n",
"-----------------------Topic 3 --------------------------------\n",
" 0 1 2 3 4\n",
"393 0.000404 0.000401 0.000401 0.998391 0.000403\n",
"259 0.001088 0.001087 0.001085 0.995648 0.001092\n",
"246 0.001119 0.001132 0.001119 0.995493 0.001138\n",
"541 0.001222 0.001235 0.001267 0.995034 0.001242\n",
"593 0.001240 0.001247 0.001254 0.995008 0.001251\n",
"25 0.001425 0.001434 0.001446 0.994258 0.001437\n",
"649 0.001425 0.001434 0.001446 0.994258 0.001437\n",
"317 0.001537 0.001544 0.001549 0.993824 0.001546\n",
"437 0.001748 0.001748 0.001765 0.992984 0.001756\n",
"134 0.001829 0.001834 0.001843 0.992659 0.001835\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/webcamoid_webcamoid.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/lxc_lxc.git_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/open-power_skiboot.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/lxc_lxcfs.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/ocaml_dune.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/janestreet_variantslib.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/janestreet_sexplib.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/lastpass_lastpass-cli_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pub_scm_linux_kernel_git_jberg_iw.git_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/sslmate_certspotter_hullabaloo_CONTRIBUTING\n",
" 0 1 2 3 4\n",
"536 0.999495 0.000126 0.000126 0.000127 0.000126\n",
"35 0.339901 0.060447 0.599367 0.000143 0.000143\n",
"652 0.432560 0.213230 0.353690 0.000146 0.000374\n",
"92 0.311952 0.101841 0.020145 0.000149 0.565914\n",
"377 0.243054 0.040215 0.353551 0.000206 0.362974\n",
"670 0.293267 0.000207 0.436114 0.000207 0.270205\n",
"74 0.257880 0.021230 0.366006 0.000219 0.354666\n",
"359 0.208595 0.562651 0.228310 0.000221 0.000223\n",
"363 0.204923 0.366357 0.258074 0.000240 0.170405\n",
"502 0.967482 0.031760 0.000254 0.000251 0.000252\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/rdiff-backup_rdiff-backup.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/scipy_scipy.git_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/zkat_ssri.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/linuxmint_cjs.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/dbus_dbus_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/GNOME_gjs.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/sphinx-doc_sphinx.git_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pyproj4_pyproj.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pgbackrest_pgbackrest_hullabaloo_CONTRIBUTING.md\n",
"-----------------------Topic 4 --------------------------------\n",
" 0 1 2 3 4\n",
"175 0.000490 0.000493 0.000497 0.000495 0.998025\n",
"607 0.000490 0.000493 0.000497 0.000495 0.998025\n",
"663 0.000490 0.000493 0.000497 0.000495 0.998025\n",
"686 0.000490 0.000493 0.000497 0.000495 0.998025\n",
"485 0.001467 0.001482 0.001464 0.001459 0.994127\n",
"369 0.001498 0.001494 0.001486 0.001527 0.993996\n",
"112 0.002279 0.002273 0.002292 0.002283 0.990873\n",
"635 0.002539 0.002564 0.002591 0.002599 0.989706\n",
"575 0.003807 0.003792 0.003766 0.003733 0.984901\n",
"151 0.004099 0.004152 0.004123 0.004177 0.983448\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_pkix.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_eimp.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_stun.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_fast_tls.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/ocaml_ocamlbuild.git_hullabaloo_CONTRIBUTING.adoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/eclipse_paho.mqtt.python.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/tpm2-software_tpm2-abrmd.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/Fluidsynth_fluidsynth.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/Leaflet_Leaflet_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/zopefoundation_zope.event_hullabaloo_CONTRIBUTING.md\n",
" 0 1 2 3 4\n",
"536 0.999495 0.000126 0.000126 0.000127 0.000126\n",
"35 0.339901 0.060447 0.599367 0.000143 0.000143\n",
"359 0.208595 0.562651 0.228310 0.000221 0.000223\n",
"502 0.967482 0.031760 0.000254 0.000251 0.000252\n",
"29 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"295 0.000251 0.000252 0.998989 0.000255 0.000254\n",
"344 0.825088 0.051655 0.122628 0.000315 0.000314\n",
"281 0.259153 0.235774 0.504441 0.000315 0.000317\n",
"261 0.883447 0.000317 0.115603 0.000314 0.000319\n",
"100 0.922202 0.076817 0.000326 0.000328 0.000327\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/rdiff-backup_rdiff-backup.git_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/sphinx-doc_sphinx.git_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/pgbackrest_pgbackrest_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-apache_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/kilobyte_memkind_hullabaloo_CONTRIBUTING\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/xonsh_xonsh.git_hullabaloo_CONTRIBUTING.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/GNOME_folks_hullabaloo_CONTRIBUTING.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/rsms_inter_hullabaloo_CONTRIBUTING.md\n",
"0 0.160978\n",
"1 0.288113\n",
"2 0.173551\n",
"3 0.101544\n",
"4 0.275814\n",
"dtype: float64\n"
]
}
],
"source": [
"prevalent_topics(data_vectorized, file_list)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "95e3bfc9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 2, 3, 4]\n"
]
}
],
"source": [
"most_frequent(topic_prevalence)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}