1
0
govdoc-cr-analysis/text_analysis/readmetopicModel.ipynb

1358 lines
94 KiB
Plaintext
Raw Normal View History

2025-02-02 21:42:09 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"import glob\n",
"import copy\n",
"import csv\n",
"from statistics import mean, median\n",
"from strip_markdown import strip_markdown\n",
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9483091c-ac72-415c-932d-ac7cf7970789",
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import gensim.corpora as corpora\n",
"from gensim.utils import simple_preprocess\n",
"from gensim.models import CoherenceModel\n",
"from gensim.models.phrases import Phrases\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"from statistics import mode"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "196abd6a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /home/SOC.NORTHWESTERN.EDU/nws8519/nltk_data...\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#import nltk\n",
"#nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3da6b590-875d-478d-aaaa-de020039c519",
"metadata": {},
"outputs": [],
"source": [
"# spacy and nltk for lemmatization\n",
"import nltk \n",
"#nltk.download('stopwords')\n",
"import spacy\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"\n",
"stopwords = stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
"metadata": {},
"outputs": [],
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Return (word_count, avg_word_length) for the text of one document.\n",
"\n",
"    `file` is the document text itself (a string), not a path. Words are the\n",
"    whitespace-separated tokens; a document without any words yields (0, 0).\n",
"    \"\"\"\n",
"    tokens = file.split()\n",
"    n_tokens = len(tokens)\n",
"    #empty document: nothing to average, and it avoids dividing by zero\n",
"    if not n_tokens:\n",
"        return 0, 0\n",
"    total_chars = sum(len(token) for token in tokens)\n",
"    return n_tokens, total_chars / n_tokens"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_dir(directory):\n",
"    \"\"\"Read every path matched by `{directory}/*` and collect its text and word statistics.\n",
"\n",
"    Returns four parallel lists: document texts, word counts, average word\n",
"    lengths and the matched paths, all in sorted path order.\n",
"    \"\"\"\n",
"    data_list = []\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    #glob gives no ordering guarantee; sorting keeps positional indices stable between runs\n",
"    for file in sorted(glob.glob(f\"{directory}/*\")):\n",
"        #the with-block closes each handle promptly; undecodable bytes are skipped\n",
"        with open(file, encoding='utf-8', errors='ignore') as handle:\n",
"            text = handle.read()\n",
"        #here's some of the descriptive text analysis\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        #adding the data to the list of text\n",
"        data_list.append(text)\n",
"        #adding filename\n",
"        file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
"metadata": {},
"outputs": [],
"source": [
"#preprocessing text data\n",
"def preprocess(corpus_list):\n",
"    \"\"\"Clean raw documents and return one space-joined string of lemmas per document.\n",
"\n",
"    Per document: strip markdown, remove HTML comments, tokenize with\n",
"    simple_preprocess, drop stopwords and tokens of two characters or fewer,\n",
"    then lemmatize every remaining token with the WordNet lemmatizer.\n",
"    The input list and the module-level `stopwords` list are left unmodified.\n",
"    \"\"\"\n",
"    #extending stopwords\n",
"    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
"                          \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
"    #a local set: repeated calls no longer keep growing the shared list, and lookups are constant-time\n",
"    all_stopwords = set(stopwords).union(specific_stopwords)\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    D_lemma = []\n",
"    for doc in corpus_list:\n",
"        #stripping markdown from documents\n",
"        doc = strip_markdown(doc)\n",
"        #strip html comments (other markup is left in place)\n",
"        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL)\n",
"        #mvp right now, can certainly be expanded as iterations of text analysis are done\n",
"        tokens = [token for token in simple_preprocess(doc) if token not in all_stopwords and len(token) > 2]\n",
"        D_lemma.append(\" \".join(lemmatizer.lemmatize(token) for token in tokens))\n",
"    return D_lemma"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
"metadata": {},
"outputs": [],
"source": [
"#preparing processed data for model usage\n",
"def text_preparation(lemmatized_text):\n",
"    \"\"\"Build a bag-of-words corpus (tokens plus detected bigrams) and its gensim dictionary.\n",
"\n",
"    Each document may be a list of tokens or a whitespace-separated string\n",
"    (the form `preprocess` returns). The caller's documents are not modified.\n",
"    Returns (bag_of_words, id2word).\n",
"    \"\"\"\n",
"    #one fresh token list per document, so the appends below never touch the caller's data\n",
"    D_bigrams = [doc.split() if isinstance(doc, str) else list(doc) for doc in lemmatized_text]\n",
"    #bigrams\n",
"    bigram = Phrases(D_bigrams, min_count=2)\n",
"    for doc in D_bigrams:\n",
"        #collect the joined tokens first, then extend, so the document is not changed while it is scanned\n",
"        joined_tokens = [token for token in bigram[doc] if '_' in token]\n",
"        doc.extend(joined_tokens)\n",
"    #id2word\n",
"    id2word = corpora.Dictionary(D_bigrams)\n",
"    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
"    #bow representation\n",
"    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n",
"    return bag_of_words, id2word"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "24799e25-2c0c-4e16-b503-68296f604f52",
"metadata": {},
"outputs": [],
"source": [
"def lda_model_identification(data_vectorized):\n",
"    \"\"\"Grid-search LDA settings on the document-term matrix and print the winner.\n",
"\n",
"    Searches learning_decay and batch_size at a fixed n_components of 11 with\n",
"    GridSearchCV, then prints the best parameters, the best cross-validation\n",
"    score and the perplexity of the best estimator on `data_vectorized`.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    param_grid = {\n",
"        'n_components': [11],\n",
"        'learning_decay': [.5, .7, .9],\n",
"        'batch_size': [128, 256],\n",
"    }\n",
"    grid_search = GridSearchCV(LatentDirichletAllocation(), param_grid=param_grid, verbose=10)\n",
"    grid_search.fit(data_vectorized)\n",
"    winner = grid_search.best_estimator_\n",
"    print(\"Best Model's Params: \", grid_search.best_params_)\n",
"    print(\"Best Log Likelihood Score: \", grid_search.best_score_)\n",
"    print(\"Model Perplexity: \", winner.perplexity(data_vectorized))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
"metadata": {},
"outputs": [],
"source": [
"def best_lda_model(data_vectorized, vocab, n_components=11, n_top_words=10,\n",
"                   model_path='020125_README_lda.jl', random_state=None):\n",
"    \"\"\"Fit the LDA model, print each topic's top words, save the model and return the document-topic matrix.\n",
"\n",
"    data_vectorized: document-term matrix to fit on.\n",
"    vocab: sequence mapping a column index of that matrix to its term.\n",
"    n_components: number of topics (default 11).\n",
"    n_top_words: how many of the highest-weighted terms to print per topic (default 10).\n",
"    model_path: where the fitted model is written with joblib.\n",
"    random_state: passed to LatentDirichletAllocation; give an int for a repeatable fit.\n",
"    The defaults reproduce the previously hard-coded settings.\n",
"    \"\"\"\n",
"    lda = LatentDirichletAllocation(n_components=n_components, learning_decay=0.5, batch_size=256,\n",
"                                    max_iter=50, random_state=random_state)\n",
"    id_topic = lda.fit_transform(data_vectorized)\n",
"    for topic, comp in enumerate(lda.components_):\n",
"        #column indices sorted by weight, largest first, cut to the top terms\n",
"        word_idx = np.argsort(comp)[::-1][:n_top_words]\n",
"        print('Topic: %d' % topic)\n",
"        print(' %s' % ', '.join(vocab[i] for i in word_idx))\n",
"    #persisted so the other cells can reload the exact fitted model\n",
"    joblib.dump(lda, model_path)\n",
"    return id_topic"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
"metadata": {},
"outputs": [],
"source": [
"def get_most_prevalent(vect_documents, documents, model_path='020125_README_lda.jl'):\n",
"    \"\"\"For every topic, find the document that carries the highest weight for it.\n",
"\n",
"    Loads the LDA model saved at `model_path`, transforms `vect_documents` and\n",
"    prints and returns a dict {topic index: [highest weight, documents[i]]}.\n",
"    `documents` must be index-aligned with the rows of `vect_documents`.\n",
"    A topic for which no document has a weight above 0 keeps the placeholder [0, \"\"].\n",
"    \"\"\"\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    #size the result from the model output instead of assuming 11 topics\n",
"    n_topics = distributions.shape[1]\n",
"    most_prevalent = {j: [0, \"\"] for j in range(n_topics)}\n",
"    for i, topic_distribution in enumerate(distributions):\n",
"        for j in range(n_topics):\n",
"            #strict comparison: on equal weights the earlier document is kept\n",
"            if topic_distribution[j] > most_prevalent[j][0]:\n",
"                most_prevalent[j] = [topic_distribution[j], documents[i]]\n",
"    print(most_prevalent)\n",
"    return most_prevalent\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
"metadata": {},
"outputs": [],
"source": [
"def prevalent_topics(vect_documents, file_list, model_path='020125_README_lda.jl',\n",
"                     output_csv='020125_README_file_topic_distributions.csv', n_show=10):\n",
"    \"\"\"Write every document's topic distribution to CSV and print the extremes per topic.\n",
"\n",
"    Loads the LDA model saved at `model_path`, transforms `vect_documents`, then:\n",
"      * prints how many documents do not have exactly one strongest topic,\n",
"      * writes one CSV row per document to `output_csv` (file name plus t0..tN weights),\n",
"      * for each topic prints the `n_show` highest- and lowest-weighted documents\n",
"        together with their paths,\n",
"      * prints the mean weight of every topic.\n",
"    `file_list` must be index-aligned with the rows of `vect_documents`.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    #derive the topic count from the model output instead of assuming 11\n",
"    n_topics = distributions.shape[1]\n",
"    #documents whose largest weight is not held by exactly one topic\n",
"    count_of_multiple = 0\n",
"    for topic_distribution in distributions:\n",
"        if np.count_nonzero(topic_distribution == max(topic_distribution)) != 1:\n",
"            count_of_multiple += 1\n",
"    print(count_of_multiple)\n",
"    #one row per document, one integer-labelled column per topic\n",
"    df = pd.DataFrame(distributions)\n",
"    #finding the distribution values for all documents\n",
"    fieldnames = ['filename'] + ['t' + str(j) for j in range(n_topics)]\n",
"    with open(output_csv, 'w', newline='') as csvfile:\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"        for i, topic_distribution in enumerate(distributions):\n",
"            #only the last path component is stored as the file name\n",
"            project_dir = {'filename': file_list[i].split(\"/\")[-1]}\n",
"            for j in range(n_topics):\n",
"                project_dir[\"t\" + str(j)] = topic_distribution[j]\n",
"            writer.writerow(project_dir)\n",
"    for i in range(n_topics):\n",
"        print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n",
"        highest = df.nlargest(n_show, i)\n",
"        print(highest)\n",
"        for index in highest.index.to_list():\n",
"            print(file_list[index])\n",
"        lowest = df.nsmallest(n_show, i)\n",
"        print(lowest)\n",
"        for index in lowest.index.to_list():\n",
"            print(file_list[index])\n",
"    averages = df.mean()\n",
"    print(averages)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
"metadata": {},
"outputs": [],
"source": [
"def most_frequent(topic_prevalence, n_topics=11):\n",
"    \"\"\"Print and return the distinct values of `topic_prevalence`, most frequent first.\n",
"\n",
"    At most `n_topics` values are ranked (default 11). Values with equal counts\n",
"    keep the order in which they first appear. The input list is not modified.\n",
"    \"\"\"\n",
"    remaining = list(topic_prevalence)\n",
"    most_frequent_array = []\n",
"    #stop once every distinct value is ranked: mode() raises StatisticsError on an empty list\n",
"    while remaining and len(most_frequent_array) < n_topics:\n",
"        topic = mode(remaining)\n",
"        most_frequent_array.append(topic)\n",
"        #drop the ranked value so the next pass finds the runner-up\n",
"        remaining = [i for i in remaining if i != topic]\n",
"    print(most_frequent_array)\n",
"    return most_frequent_array"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "69d606fd",
"metadata": {},
"outputs": [],
"source": [
"#input directory handed to get_data_from_dir; absolute path, so it must exist on the machine running the notebook\n",
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1f937c2e-2714-475d-b670-602164c46642",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean wordcount: 272.11756407241944\n",
"Median wordcount: 98\n",
"Mean wordlength: 6.0641336743311145\n",
"Median wordlength: 5.841463414634147\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/SOC.NORTHWESTERN.EDU/nws8519/anaconda3/lib/python3.12/html/parser.py:171: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n",
" k = self.parse_starttag(i)\n"
]
}
],
"source": [
"#read every file in readme_directory, then report corpus-level word statistics\n",
"listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n",
"print(\"Mean wordcount: \", mean(wordcounts))\n",
"print(\"Median wordcount: \", median(wordcounts))\n",
"print(\"Mean wordlength: \", mean(wordlengths))\n",
"print(\"Median wordlength: \", median(wordlengths))\n",
"#clean and lemmatize each document; the result is one space-joined string per document\n",
"lemmatized_corpus = preprocess(listed_corpus)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['020125_README_vectorizer.joblib']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"#one-time CountVectorizer fit, kept inside a string literal so the statements in it do not execute;\n",
"#the vectorizer it dumped is what the following cell loads with joblib\n",
"'''\n",
"vectorizer = CountVectorizer(analyzer='word', \n",
" min_df=2, \n",
" stop_words='english', \n",
" lowercase=True, \n",
" token_pattern='[a-zA-Z0-9]{2,}', \n",
" )\n",
"data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
"joblib.dump(vectorizer, '020125_README_vectorizer.joblib')\n",
"'''\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d68aaf7b",
"metadata": {},
"outputs": [],
"source": [
"#load the vectorizer previously saved with joblib instead of refitting it\n",
"vectorizer = joblib.load('020125_README_vectorizer.joblib')\n",
"#transform only (no fit): builds the document-term matrix for lemmatized_corpus\n",
"data_vectorized = vectorizer.transform(lemmatized_corpus) "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "dd1a70c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
"[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
"[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1007509.681 total time= 10.3s\n",
"[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
"[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1014261.652 total time= 10.7s\n",
"[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
"[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1022848.244 total time= 10.3s\n",
"[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
"[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-973246.017 total time= 9.8s\n",
"[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
"[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-999233.122 total time= 9.8s\n",
"[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
"[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1005592.521 total time= 9.5s\n",
"[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
"[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1018157.449 total time= 9.9s\n",
"[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
"[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1021034.619 total time= 10.0s\n",
"[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
"[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-975254.657 total time= 10.1s\n",
"[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
"[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-999502.591 total time= 9.9s\n",
"[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
"[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1006733.511 total time= 9.8s\n",
"[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
"[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1014617.289 total time= 9.8s\n",
"[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
"[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1020025.742 total time= 10.0s\n",
"[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
"[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-974336.406 total time= 10.1s\n",
"[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
"[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1002762.208 total time= 10.0s\n",
"[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
"[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1002368.558 total time= 9.5s\n",
"[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
"[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1011512.930 total time= 9.9s\n",
"[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
"[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1021450.228 total time= 10.0s\n",
"[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
"[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-974933.561 total time= 9.7s\n",
"[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
"[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1000500.033 total time= 9.9s\n",
"[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
"[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1004646.970 total time= 9.6s\n",
"[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
"[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1011587.159 total time= 9.8s\n",
"[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
"[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1020348.275 total time= 9.8s\n",
"[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
"[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-974751.507 total time= 10.0s\n",
"[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
"[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1001461.612 total time= 9.8s\n",
"[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
"[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1005603.520 total time= 9.8s\n",
"[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
"[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1014507.304 total time= 9.8s\n",
"[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
"[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1022378.609 total time= 10.0s\n",
"[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
"[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-971582.299 total time= 9.9s\n",
"[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
"[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1000234.956 total time= 9.7s\n",
"Best Model's Params: {'batch_size': 256, 'learning_decay': 0.5, 'n_components': 11}\n",
"Best Log Likelihood Score: -1002153.0620655585\n",
"Model Perplexity: 2065.7772975666703\n"
]
}
],
"source": [
"#cross-validated grid search over LDA settings; prints the best parameters, score and perplexity\n",
"lda_model_identification(data_vectorized)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "aa83d20f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic: 0\n",
" obj, http, stream, filter, length, type, image, pypi, svg, page\n",
"Topic: 1\n",
" module, perl, test, make, server, cpan, install, command, version, process\n",
"Topic: 2\n",
" window, device, linux, bug, server, gnome, packet, network, support, work\n",
"Topic: 3\n",
" install, make, build, directory, package, file, configure, library, usr, path\n",
"Topic: 4\n",
" license, copyright, gnu, version, public, free, general, warranty, copy, library\n",
"Topic: 5\n",
" class, object, client, django, interface, json, key, request, new, url\n",
"Topic: 6\n",
" html, xml, node, like, make, using, page, language, library, graph\n",
"Topic: 7\n",
" test, version, project, git, google, package, setup, add, library, support\n",
"Topic: 8\n",
" table, function, default, mode, path, text, add, level, used, output\n",
"Topic: 9\n",
" file, image, support, format, library, font, version, read, example, html\n",
"Topic: 10\n",
" value, function, string, object, return, type, data, method, error, argument\n"
]
}
],
"source": [
"#fit and save the LDA model; the vectorizer's feature names supply the vocabulary for the printed top words\n",
"topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f4345bd6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: [0.9998131703476353, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'], 1: [0.9936580635354768, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'], 2: [0.9992995657213791, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'], 3: [0.988192939654375, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'], 4: [0.9964897891037261, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'], 5: [0.9943880112670485, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'], 6: [0.999759729377782, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md'], 7: [0.998666933112433, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst'], 8: [0.9996679425883734, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md'], 9: [0.99815957978939, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/agmartin_linuxdoc-tools_hullabaloo_README'], 10: [0.9996663626936376, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt']}\n"
]
},
{
"data": {
"text/plain": [
"{0: [0.9998131703476353,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'],\n",
" 1: [0.9936580635354768,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'],\n",
" 2: [0.9992995657213791,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'],\n",
" 3: [0.988192939654375,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'],\n",
" 4: [0.9964897891037261,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'],\n",
" 5: [0.9943880112670485,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'],\n",
" 6: [0.999759729377782,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md'],\n",
" 7: [0.998666933112433,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst'],\n",
" 8: [0.9996679425883734,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md'],\n",
" 9: [0.99815957978939,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/agmartin_linuxdoc-tools_hullabaloo_README'],\n",
" 10: [0.9996663626936376,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt']}"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#for each topic, the single document with the highest weight; file_list supplies the document labels\n",
"get_most_prevalent(data_vectorized, file_list)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "23468e82",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"349\n",
"-----------------------Topic 0 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"939 0.986430 0.001357 0.001357 0.001357 0.001357 0.001357 0.001357 \n",
"1267 0.928259 0.000777 0.000777 0.064747 0.000777 0.000777 0.000777 \n",
"206 0.919651 0.001357 0.001357 0.001357 0.068137 0.001357 0.001357 \n",
"3626 0.908870 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 \n",
"2397 0.891207 0.001818 0.001818 0.001819 0.001818 0.092427 0.001818 \n",
"3630 0.889165 0.000587 0.000587 0.105556 0.000587 0.000587 0.000587 \n",
"1781 0.888760 0.000699 0.000699 0.104946 0.000699 0.000699 0.000699 \n",
"60 0.863787 0.001299 0.001299 0.001299 0.001299 0.001299 0.001299 \n",
"2792 0.848473 0.015152 0.015152 0.015154 0.015153 0.015154 0.015154 \n",
"\n",
" 7 8 9 10 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"939 0.001357 0.001357 0.001357 0.001357 \n",
"1267 0.000777 0.000777 0.000777 0.000777 \n",
"206 0.001357 0.001357 0.001357 0.001357 \n",
"3626 0.002066 0.002066 0.072534 0.002066 \n",
"2397 0.001818 0.001818 0.001819 0.001818 \n",
"3630 0.000587 0.000587 0.000587 0.000587 \n",
"1781 0.000699 0.000699 0.000699 0.000699 \n",
"60 0.001299 0.001299 0.001299 0.124523 \n",
"2792 0.015152 0.015152 0.015152 0.015153 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/zopefoundation_roman_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kingosticks_mopidy-tunein_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_astroplan.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ionelmc_python-tblib_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/pylons_plaster_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mopidy_mopidy-alsamixer_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tkem_mopidy-internetarchive_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Toilal_rebulk_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/halcy_Mastodon.py.git_hullabaloo_README.md\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"1107 0.000030 0.071306 0.000030 0.018016 0.011003 0.000030 0.000030 \n",
"3280 0.000032 0.000032 0.070475 0.026385 0.007691 0.000032 0.003867 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1437 0.000038 0.000038 0.677241 0.075530 0.012854 0.000038 0.000038 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"1107 0.561982 0.061747 0.000030 0.275796 \n",
"3280 0.000032 0.820210 0.071213 0.000032 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"1437 0.000038 0.093611 0.120873 0.019700 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"-----------------------Topic 1 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"2518 0.005935 0.993658 0.000045 0.000045 0.000045 0.000045 0.000045 \n",
"2677 0.000722 0.992785 0.000722 0.000722 0.000722 0.000722 0.000722 \n",
"4073 0.000967 0.990328 0.000967 0.000967 0.000967 0.000967 0.000967 \n",
"1755 0.001021 0.989785 0.001022 0.001022 0.001022 0.001022 0.001021 \n",
"2100 0.001045 0.989550 0.001045 0.001045 0.001045 0.001045 0.001045 \n",
"2535 0.001057 0.989428 0.001057 0.001057 0.001057 0.001057 0.001057 \n",
"222 0.001151 0.988492 0.001151 0.001151 0.001151 0.001151 0.001151 \n",
"1473 0.001245 0.987546 0.001245 0.001246 0.001246 0.001245 0.001245 \n",
"1226 0.001299 0.987012 0.001299 0.001299 0.001299 0.001299 0.001299 \n",
"3717 0.002525 0.974746 0.002525 0.002526 0.002526 0.002525 0.002525 \n",
"\n",
" 7 8 9 10 \n",
"2518 0.000045 0.000045 0.000045 0.000045 \n",
"2677 0.000722 0.000722 0.000722 0.000722 \n",
"4073 0.000967 0.000967 0.000967 0.000967 \n",
"1755 0.001022 0.001022 0.001022 0.001021 \n",
"2100 0.001045 0.001045 0.001045 0.001045 \n",
"2535 0.001057 0.001057 0.001057 0.001057 \n",
"222 0.001151 0.001151 0.001151 0.001151 \n",
"1473 0.001245 0.001245 0.001245 0.001245 \n",
"1226 0.001299 0.001299 0.001299 0.001299 \n",
"3717 0.002525 0.002525 0.002525 0.002525 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/book_Test-Database.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/shlomif_perl-io-socket-inet6.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/maddingue_SNMP-Extension-PassPersist.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/perl-openssl_perl-crypt-openssl-pkcs10.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/wchristian_crypt-dh.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/theory_class-meta.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ap_Test-File-Contents.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/porridge_ydpdict_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/p5-number-fraction_number-fraction.git_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"3280 0.000032 0.000032 0.070475 0.026385 0.007691 0.000032 0.003867 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1437 0.000038 0.000038 0.677241 0.075530 0.012854 0.000038 0.000038 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"3280 0.000032 0.820210 0.071213 0.000032 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"1437 0.000038 0.093611 0.120873 0.019700 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"-----------------------Topic 2 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"1570 0.000070 0.000070 0.999300 0.000070 0.000070 0.000070 0.000070 \n",
"97 0.002410 0.000063 0.997025 0.000063 0.000063 0.000063 0.000063 \n",
"1076 0.001653 0.001653 0.983470 0.001653 0.001653 0.001653 0.001653 \n",
"2419 0.002331 0.002331 0.976687 0.002331 0.002331 0.002331 0.002331 \n",
"627 0.003955 0.003953 0.960470 0.003953 0.003953 0.003953 0.003953 \n",
"4107 0.004546 0.004546 0.954540 0.004546 0.004546 0.004546 0.004546 \n",
"3149 0.000410 0.065727 0.930587 0.000410 0.000410 0.000410 0.000410 \n",
"1864 0.007576 0.007576 0.924238 0.007576 0.007577 0.007576 0.007576 \n",
"288 0.007576 0.007576 0.924238 0.007577 0.007576 0.007576 0.007576 \n",
"1066 0.007576 0.007576 0.924235 0.007577 0.007576 0.007576 0.007578 \n",
"\n",
" 7 8 9 10 \n",
"1570 0.000070 0.000070 0.000070 0.000070 \n",
"97 0.000063 0.000063 0.000063 0.000063 \n",
"1076 0.001653 0.001653 0.001653 0.001653 \n",
"2419 0.002331 0.002331 0.002332 0.002331 \n",
"627 0.003953 0.003953 0.003953 0.003953 \n",
"4107 0.004546 0.004546 0.004546 0.004546 \n",
"3149 0.000410 0.000410 0.000410 0.000410 \n",
"1864 0.007577 0.007576 0.007576 0.007576 \n",
"288 0.007576 0.007576 0.007576 0.007576 \n",
"1066 0.007576 0.007576 0.007576 0.007576 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_marco.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_gdk-pixbuf_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/conserver_conserver.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ukui_peony_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_caja-actions_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mogaal_sendemail_hullabaloo_README-BR.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sdr_rtl-sdr.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/brendangregg_perf-tools_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_libwnck_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1107 0.000030 0.071306 0.000030 0.018016 0.011003 0.000030 0.000030 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"832 0.000043 0.000043 0.000043 0.000043 0.000043 0.000043 0.061009 \n",
"2063 0.009143 0.000045 0.000045 0.070350 0.000045 0.465237 0.020005 \n",
"2518 0.005935 0.993658 0.000045 0.000045 0.000045 0.000045 0.000045 \n",
"2068 0.000046 0.000046 0.000046 0.022781 0.000046 0.003475 0.000046 \n",
"\n",
" 7 8 9 10 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1107 0.561982 0.061747 0.000030 0.275796 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"832 0.000043 0.349562 0.423503 0.165625 \n",
"2063 0.056883 0.147322 0.000045 0.230882 \n",
"2518 0.000045 0.000045 0.000045 0.000045 \n",
"2068 0.054426 0.103594 0.392211 0.423285 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enthought_mayavi.git_hullabaloo_README.txt\n",
"-----------------------Topic 3 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"3197 0.001181 0.001181 0.001181 0.988193 0.001181 0.001181 0.001181 \n",
"959 0.001856 0.001855 0.001855 0.981446 0.001855 0.001855 0.001855 \n",
"3641 0.001934 0.001935 0.001934 0.980656 0.001934 0.001935 0.001934 \n",
"946 0.002066 0.002066 0.002066 0.979338 0.002066 0.002066 0.002066 \n",
"2855 0.002457 0.002457 0.002457 0.975428 0.002457 0.002457 0.002457 \n",
"1398 0.002755 0.002755 0.002755 0.972451 0.002755 0.002755 0.002755 \n",
"2019 0.002755 0.002755 0.002755 0.972451 0.002755 0.002755 0.002755 \n",
"321 0.002755 0.002755 0.002755 0.972450 0.002755 0.002755 0.002755 \n",
"2612 0.002933 0.002933 0.002933 0.970673 0.002933 0.002933 0.002933 \n",
"1510 0.003030 0.003030 0.003030 0.969696 0.003030 0.003031 0.003030 \n",
"\n",
" 7 8 9 10 \n",
"3197 0.001181 0.001181 0.001181 0.001181 \n",
"959 0.001855 0.001855 0.001855 0.001855 \n",
"3641 0.001935 0.001934 0.001934 0.001934 \n",
"946 0.002066 0.002066 0.002066 0.002066 \n",
"2855 0.002458 0.002457 0.002457 0.002457 \n",
"1398 0.002755 0.002755 0.002755 0.002755 \n",
"2019 0.002755 0.002755 0.002755 0.002755 \n",
"321 0.002755 0.002755 0.002755 0.002755 \n",
"2612 0.002933 0.002933 0.002933 0.002933 \n",
"1510 0.003030 0.003030 0.003030 0.003030 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/lastpass_lastpass-cli_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Ultimaker_libSavitar.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/systemd-cron_systemd-cron.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mati75_volumeicon-debian.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/x42_x42-plugins_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kiwix_libkiwix.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/rolinh_dfc_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/cairo_cairomm_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/dmc_gfal2_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"832 0.000043 0.000043 0.000043 0.000043 0.000043 0.000043 0.061009 \n",
"2518 0.005935 0.993658 0.000045 0.000045 0.000045 0.000045 0.000045 \n",
"2829 0.000046 0.052600 0.069040 0.000046 0.048128 0.000046 0.000046 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"832 0.000043 0.349562 0.423503 0.165625 \n",
"2518 0.000045 0.000045 0.000045 0.000045 \n",
"2829 0.000046 0.000046 0.000046 0.829909 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/PyGreSQL_PyGreSQL_hullabaloo_README\n",
"-----------------------Topic 4 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"3298 0.000351 0.000351 0.000351 0.000351 0.996490 0.000351 0.000351 \n",
"2734 0.000410 0.000410 0.000410 0.000410 0.995905 0.000410 0.000410 \n",
"1147 0.000457 0.000457 0.000457 0.000457 0.995431 0.000457 0.000457 \n",
"3491 0.000505 0.000505 0.000505 0.000505 0.994949 0.000505 0.000505 \n",
"4251 0.000544 0.000544 0.000544 0.000544 0.994556 0.000544 0.000544 \n",
"1665 0.000598 0.000598 0.000598 0.000598 0.994019 0.000598 0.000598 \n",
"2474 0.000606 0.000606 0.000606 0.000606 0.993939 0.000606 0.000606 \n",
"3518 0.000668 0.000669 0.000668 0.000669 0.993315 0.000668 0.000669 \n",
"2686 0.000805 0.000805 0.000805 0.000805 0.991955 0.000805 0.000805 \n",
"4028 0.000834 0.000834 0.000834 0.000834 0.991659 0.000834 0.000834 \n",
"\n",
" 7 8 9 10 \n",
"3298 0.000351 0.000351 0.000351 0.000351 \n",
"2734 0.000410 0.000410 0.000410 0.000410 \n",
"1147 0.000457 0.000457 0.000457 0.000457 \n",
"3491 0.000505 0.000505 0.000505 0.000505 \n",
"4251 0.000544 0.000544 0.000544 0.000544 \n",
"1665 0.000598 0.000598 0.000598 0.000598 \n",
"2474 0.000606 0.000606 0.000606 0.000606 \n",
"3518 0.000668 0.000669 0.000669 0.000669 \n",
"2686 0.000805 0.000805 0.000805 0.000805 \n",
"4028 0.000834 0.000834 0.000834 0.000834 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_groupware.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_imp.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_mnemo.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_kronolith.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_sesha.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_gollem.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ocaml-batteries-team_batteries-included.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/nxt-firmware.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_trean.git_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"3594 0.000043 0.000043 0.025673 0.009215 0.000043 0.000043 0.000043 \n",
"832 0.000043 0.000043 0.000043 0.000043 0.000043 0.000043 0.061009 \n",
"3488 0.000044 0.000044 0.016315 0.037750 0.000044 0.237159 0.020084 \n",
"2063 0.009143 0.000045 0.000045 0.070350 0.000045 0.465237 0.020005 \n",
"\n",
" 7 8 9 10 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"3594 0.000043 0.080246 0.000043 0.884566 \n",
"832 0.000043 0.349562 0.423503 0.165625 \n",
"3488 0.000044 0.000044 0.499671 0.188801 \n",
"2063 0.056883 0.147322 0.000045 0.230882 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/OpenPrinting_foomatic-db.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
"-----------------------Topic 5 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"1476 0.000561 0.000561 0.000561 0.000561 0.000561 0.994388 0.000561 \n",
"2071 0.000623 0.000623 0.000623 0.000623 0.000623 0.993773 0.000623 \n",
"418 0.000819 0.000819 0.000819 0.000819 0.000819 0.991810 0.000819 \n",
"4222 0.001196 0.001196 0.001196 0.001196 0.001196 0.988037 0.001196 \n",
"3804 0.001280 0.001280 0.001281 0.001280 0.001280 0.987195 0.001281 \n",
"168 0.001377 0.001378 0.001377 0.001378 0.001378 0.986225 0.001377 \n",
"3429 0.001466 0.001466 0.001466 0.001466 0.001466 0.985337 0.001466 \n",
"1651 0.001567 0.001567 0.001567 0.001567 0.001567 0.984325 0.001567 \n",
"3185 0.001653 0.001653 0.001653 0.001653 0.001653 0.983470 0.001653 \n",
"411 0.001748 0.001748 0.001748 0.001748 0.001748 0.982517 0.001748 \n",
"\n",
" 7 8 9 10 \n",
"1476 0.000561 0.000561 0.000561 0.000561 \n",
"2071 0.000623 0.000623 0.000623 0.000623 \n",
"418 0.000819 0.000819 0.000819 0.000819 \n",
"4222 0.001196 0.001196 0.001196 0.001196 \n",
"3804 0.001280 0.001281 0.001280 0.001281 \n",
"168 0.001378 0.001377 0.001377 0.001378 \n",
"3429 0.001466 0.001466 0.001466 0.001466 \n",
"1651 0.001568 0.001567 0.001567 0.001567 \n",
"3185 0.001653 0.001653 0.001653 0.001653 \n",
"411 0.001748 0.001748 0.001748 0.001748 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/rails_jbuilder_hullabaloo_README.rd\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activerecord_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ruby-amqp_amqp.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/lostisland_faraday_middleware_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ruby-concurrency_thread_safe_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/doorkeeper-gem_doorkeeper-openid_connect.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/rails_rails-dom-testing_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/lwe_entypo-rails_hullabaloo_README.md\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"1107 0.000030 0.071306 0.000030 0.018016 0.011003 0.000030 0.000030 \n",
"3280 0.000032 0.000032 0.070475 0.026385 0.007691 0.000032 0.003867 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1437 0.000038 0.000038 0.677241 0.075530 0.012854 0.000038 0.000038 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"1107 0.561982 0.061747 0.000030 0.275796 \n",
"3280 0.000032 0.820210 0.071213 0.000032 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"1437 0.000038 0.093611 0.120873 0.019700 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"-----------------------Topic 6 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3770 0.001894 0.001894 0.001894 0.001894 0.001894 0.001894 0.981059 \n",
"277 0.002597 0.002598 0.002598 0.002598 0.002598 0.002598 0.974023 \n",
"2445 0.000610 0.000610 0.000610 0.000610 0.000610 0.000610 0.958713 \n",
"3773 0.004545 0.004546 0.004546 0.004546 0.004546 0.004546 0.954543 \n",
"2176 0.004785 0.004785 0.004785 0.004785 0.004785 0.004785 0.952150 \n",
"2683 0.004785 0.004785 0.004785 0.004785 0.004785 0.004785 0.952150 \n",
"3097 0.004785 0.004785 0.004785 0.004785 0.004785 0.004785 0.952150 \n",
"3898 0.004785 0.004785 0.004785 0.004785 0.004785 0.004785 0.952150 \n",
"1438 0.000092 0.000092 0.000092 0.000092 0.000092 0.000092 0.951493 \n",
"\n",
" 7 8 9 10 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3770 0.001894 0.001894 0.001894 0.001894 \n",
"277 0.002598 0.002598 0.002598 0.002598 \n",
"2445 0.000610 0.000610 0.035795 0.000610 \n",
"3773 0.004546 0.004546 0.004546 0.004546 \n",
"2176 0.004786 0.004785 0.004785 0.004785 \n",
"2683 0.004786 0.004785 0.004785 0.004785 \n",
"3097 0.004786 0.004785 0.004785 0.004785 \n",
"3898 0.004786 0.004785 0.004785 0.004785 \n",
"1438 0.000092 0.000092 0.000092 0.047675 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jquery_sizzle.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/gregkh_bti.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tavianator_bfs.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/xonsh_xonsh.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/angband_angband_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/subdownloader_subdownloader_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Beep6581_RawTherapee_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mlpack_mlpack_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/erlware_erlware_commons.git_hullabaloo_README.md\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"1107 0.000030 0.071306 0.000030 0.018016 0.011003 0.000030 0.000030 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1437 0.000038 0.000038 0.677241 0.075530 0.012854 0.000038 0.000038 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"1107 0.561982 0.061747 0.000030 0.275796 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"1437 0.000038 0.093611 0.120873 0.019700 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"-----------------------Topic 7 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"122 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"3001 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"4122 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"3720 0.001515 0.001515 0.001515 0.001515 0.001515 0.001515 0.001515 \n",
"507 0.001567 0.001567 0.001567 0.001568 0.001568 0.001567 0.001567 \n",
"1763 0.001623 0.001624 0.001623 0.001624 0.001624 0.001623 0.001623 \n",
"3670 0.001653 0.001653 0.001653 0.001653 0.001653 0.001653 0.001653 \n",
"2801 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 \n",
"1117 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 \n",
"2984 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 0.002066 \n",
"\n",
" 7 8 9 10 \n",
"122 0.998667 0.000133 0.000133 0.000133 \n",
"3001 0.998667 0.000133 0.000133 0.000133 \n",
"4122 0.998667 0.000133 0.000133 0.000133 \n",
"3720 0.984847 0.001515 0.001515 0.001515 \n",
"507 0.984325 0.001567 0.001567 0.001567 \n",
"1763 0.983765 0.001623 0.001623 0.001623 \n",
"3670 0.983470 0.001653 0.001653 0.001653 \n",
"2801 0.979338 0.002066 0.002066 0.002066 \n",
"1117 0.979338 0.002066 0.002066 0.002066 \n",
"2984 0.979338 0.002066 0.002066 0.002066 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_astroquery.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_ccdproc.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jpastuszek_capture-output_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kaminari_kaminari_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tmuxinator_tmuxinator_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/duritong_trocla_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/technicalpickles_homesick_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/hipchat_hipchat-rb_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/cucumber_aruba.git_hullabaloo_README.rdoc\n",
" 0 1 2 3 4 5 6 \\\n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"3280 0.000032 0.000032 0.070475 0.026385 0.007691 0.000032 0.003867 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1437 0.000038 0.000038 0.677241 0.075530 0.012854 0.000038 0.000038 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"3594 0.000043 0.000043 0.025673 0.009215 0.000043 0.000043 0.000043 \n",
"\n",
" 7 8 9 10 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"3280 0.000032 0.820210 0.071213 0.000032 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"1437 0.000038 0.093611 0.120873 0.019700 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"3594 0.000043 0.080246 0.000043 0.884566 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
"-----------------------Topic 8 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"1910 0.000050 0.000050 0.000050 0.000050 0.000050 0.000050 0.000050 \n",
"2782 0.000125 0.000125 0.000125 0.000125 0.000125 0.000125 0.000125 \n",
"3940 0.009091 0.009091 0.009092 0.009091 0.009091 0.009091 0.009092 \n",
"3740 0.000076 0.000076 0.000076 0.024915 0.006011 0.000076 0.000076 \n",
"1598 0.000526 0.000526 0.063883 0.000526 0.000525 0.000526 0.025487 \n",
"3147 0.012987 0.012987 0.012987 0.012987 0.012988 0.012988 0.012990 \n",
"2298 0.000096 0.000096 0.000096 0.000096 0.000096 0.000096 0.000096 \n",
"1744 0.001684 0.001684 0.001684 0.001684 0.001684 0.001684 0.126724 \n",
"310 0.015152 0.015153 0.015152 0.015155 0.015154 0.015153 0.015153 \n",
"\n",
" 7 8 9 10 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"1910 0.000050 0.999496 0.000050 0.000050 \n",
"2782 0.000125 0.998746 0.000125 0.000125 \n",
"3940 0.009091 0.909084 0.009092 0.009093 \n",
"3740 0.000076 0.907753 0.000076 0.060791 \n",
"1598 0.000526 0.906426 0.000526 0.000526 \n",
"3147 0.012988 0.870119 0.012989 0.012988 \n",
"2298 0.033925 0.865677 0.099632 0.000096 \n",
"1744 0.001684 0.858124 0.001684 0.001684 \n",
"310 0.015153 0.848467 0.015154 0.015154 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/trackballs_trackballs.git_hullabaloo_README.html\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/keras-team_keras_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/thlorenz_combine-source-map.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/pgRouting_pgrouting.git_hullabaloo_README.routing\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mawww_kakoune.git_hullabaloo_README.asciidoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tidymodels_recipes_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Parchive_par2cmdline.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/katomic.git_hullabaloo_README.levels\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/weaverba137_pydl_hullabaloo_README.txt\n",
" 0 1 2 3 4 5 6 \\\n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"3488 0.000044 0.000044 0.016315 0.037750 0.000044 0.237159 0.020084 \n",
"2518 0.005935 0.993658 0.000045 0.000045 0.000045 0.000045 0.000045 \n",
"2829 0.000046 0.052600 0.069040 0.000046 0.048128 0.000046 0.000046 \n",
"112 0.000051 0.000051 0.000051 0.000051 0.000051 0.000051 0.000051 \n",
"\n",
" 7 8 9 10 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"3488 0.000044 0.000044 0.499671 0.188801 \n",
"2518 0.000045 0.000045 0.000045 0.000045 \n",
"2829 0.000046 0.000046 0.000046 0.829909 \n",
"112 0.564349 0.000051 0.000051 0.435188 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/OpenPrinting_foomatic-db.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/PyGreSQL_PyGreSQL_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_evolution_hullabaloo_README.TXT\n",
"-----------------------Topic 9 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"1961 0.000184 0.000184 0.000184 0.000184 0.000184 0.000184 0.000184 \n",
"2881 0.000255 0.000255 0.000255 0.000255 0.000255 0.000255 0.000255 \n",
"1106 0.000602 0.000602 0.000602 0.000602 0.000602 0.000602 0.000602 \n",
"1016 0.002114 0.002114 0.002114 0.002114 0.002115 0.002114 0.002114 \n",
"1574 0.002457 0.002457 0.002457 0.002457 0.002457 0.002457 0.002457 \n",
"2204 0.002755 0.002755 0.002755 0.002755 0.002755 0.002755 0.002755 \n",
"947 0.003953 0.003953 0.003953 0.003953 0.003953 0.003953 0.003953 \n",
"2956 0.000587 0.000587 0.040665 0.000587 0.000587 0.000587 0.000587 \n",
"1416 0.005051 0.005051 0.005051 0.005051 0.005051 0.005051 0.005051 \n",
"3068 0.005051 0.005051 0.005051 0.005051 0.005051 0.005051 0.005051 \n",
"\n",
" 7 8 9 10 \n",
"1961 0.000184 0.000184 0.998160 0.000184 \n",
"2881 0.000255 0.000255 0.997453 0.000255 \n",
"1106 0.000602 0.000602 0.993979 0.000602 \n",
"1016 0.002114 0.002115 0.978856 0.002114 \n",
"1574 0.002457 0.002457 0.975428 0.002457 \n",
"2204 0.002755 0.002755 0.972450 0.002755 \n",
"947 0.003953 0.003953 0.960471 0.003953 \n",
"2956 0.000587 0.000587 0.954056 0.000587 \n",
"1416 0.005051 0.005051 0.949493 0.005051 \n",
"3068 0.005051 0.005051 0.949493 0.005051 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/agmartin_linuxdoc-tools_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/silx-kit_fabio.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/caseman_noise.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Bioconductor_GenomeInfoDb.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/dompdf_php-font-lib_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/freedoom_freedoom_hullabaloo_README.TXT\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/LLNL_sundials.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/WinFF_winff.git_hullabaloo_README-Presets.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ianare_exif-py_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/AltraMayor_f3.git_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"1321 0.000018 0.000018 0.462060 0.062600 0.306453 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"3565 0.000024 0.000024 0.000024 0.000024 0.000024 0.042916 0.000024 \n",
"1107 0.000030 0.071306 0.000030 0.018016 0.011003 0.000030 0.000030 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2084 0.000039 0.034171 0.902181 0.000039 0.000039 0.000039 0.000039 \n",
"3594 0.000043 0.000043 0.025673 0.009215 0.000043 0.000043 0.000043 \n",
"2063 0.009143 0.000045 0.000045 0.070350 0.000045 0.465237 0.020005 \n",
"\n",
" 7 8 9 10 \n",
"1321 0.000018 0.089744 0.000018 0.079036 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"3565 0.651042 0.000024 0.000024 0.305849 \n",
"1107 0.561982 0.061747 0.000030 0.275796 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"2084 0.000039 0.000039 0.000039 0.063336 \n",
"3594 0.000043 0.080246 0.000043 0.884566 \n",
"2063 0.056883 0.147322 0.000045 0.230882 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
"-----------------------Topic 10 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
"3776 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2853 0.000130 0.000130 0.000130 0.000130 0.000130 0.000130 0.000130 \n",
"2955 0.000544 0.000544 0.000544 0.000544 0.000544 0.000544 0.000544 \n",
"1100 0.000100 0.000100 0.000100 0.000100 0.015886 0.000100 0.000100 \n",
"3123 0.002114 0.002114 0.002114 0.002114 0.002114 0.002114 0.002114 \n",
"817 0.000777 0.000777 0.000777 0.000777 0.000777 0.045709 0.000777 \n",
"1658 0.000089 0.063941 0.000089 0.000089 0.000089 0.000089 0.000089 \n",
"2834 0.008265 0.008265 0.008265 0.008265 0.008265 0.008265 0.008265 \n",
"913 0.001196 0.001196 0.001196 0.001196 0.079367 0.001196 0.001196 \n",
"2319 0.009091 0.009091 0.009092 0.009093 0.009091 0.009091 0.009091 \n",
"\n",
" 7 8 9 10 \n",
"3776 0.000033 0.000033 0.000033 0.999666 \n",
"2853 0.000130 0.000130 0.000130 0.998696 \n",
"2955 0.000544 0.000544 0.000544 0.994556 \n",
"1100 0.000100 0.000100 0.000100 0.983214 \n",
"3123 0.002114 0.002114 0.002114 0.978857 \n",
"817 0.000777 0.000777 0.000777 0.947298 \n",
"1658 0.000089 0.000089 0.000089 0.935259 \n",
"2834 0.008265 0.008265 0.008266 0.917351 \n",
"913 0.001196 0.001196 0.001196 0.909866 \n",
"2319 0.009092 0.009092 0.009092 0.909084 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/gkz_type-check.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mschilli_cache-historical-perl.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bestpractical_rt-extension-repeatticket_hullabaloo_README.pod\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/lunarmodules_say.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mikeboers_PyMemoize_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/perl5-utils_Params-Util_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/thlorenz_inline-source-map.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/osantana_dicteval.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kilobyte_termrec_hullabaloo_README\n",
" 0 1 2 3 4 5 6 \\\n",
"735 0.000018 0.000018 0.152543 0.000018 0.067391 0.000018 0.000018 \n",
"3304 0.999813 0.000019 0.000019 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 0.000024 0.000024 0.999760 \n",
"1606 0.000025 0.000025 0.581536 0.337059 0.007607 0.000025 0.000025 \n",
"3280 0.000032 0.000032 0.070475 0.026385 0.007691 0.000032 0.003867 \n",
"2157 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 0.000033 \n",
"2518 0.005935 0.993658 0.000045 0.000045 0.000045 0.000045 0.000045 \n",
"3939 0.000048 0.000048 0.129402 0.115562 0.002418 0.000048 0.056808 \n",
"1388 0.000050 0.234664 0.331442 0.264437 0.157592 0.000050 0.000050 \n",
"1910 0.000050 0.000050 0.000050 0.000050 0.000050 0.000050 0.000050 \n",
"\n",
" 7 8 9 10 \n",
"735 0.333556 0.173726 0.272679 0.000018 \n",
"3304 0.000019 0.000019 0.000019 0.000019 \n",
"376 0.000024 0.000024 0.000024 0.000024 \n",
"1606 0.000025 0.000025 0.073622 0.000025 \n",
"3280 0.000032 0.820210 0.071213 0.000032 \n",
"2157 0.000033 0.999668 0.000033 0.000033 \n",
"2518 0.000045 0.000045 0.000045 0.000045 \n",
"3939 0.014969 0.680601 0.000048 0.000048 \n",
"1388 0.000050 0.011566 0.000050 0.000050 \n",
"1910 0.000050 0.999496 0.000050 0.000050 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jrincayc_ucblogo-code.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/arno-iptables-firewall_aif.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/trackballs_trackballs.git_hullabaloo_README.html\n",
"0 0.034897\n",
"1 0.079768\n",
"2 0.096922\n",
"3 0.161341\n",
"4 0.110154\n",
"5 0.093408\n",
"6 0.070131\n",
"7 0.118367\n",
"8 0.061284\n",
"9 0.096849\n",
"10 0.076878\n",
"dtype: float64\n"
]
}
],
"source": [
"prevalent_topics(data_vectorized, file_list)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}