2025-02-02 21:42:09 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"import glob\n",
"import copy\n",
"import csv\n",
"from statistics import mean, median\n",
"from strip_markdown import strip_markdown\n",
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9483091c-ac72-415c-932d-ac7cf7970789",
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import gensim.corpora as corpora\n",
"from gensim.utils import simple_preprocess\n",
"from gensim.models import CoherenceModel\n",
"from gensim.models.phrases import Phrases\n",
"\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"from statistics import mode"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "196abd6a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /home/SOC.NORTHWESTERN.EDU/nws8519/nltk_data...\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#import nltk\n",
"#nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3da6b590-875d-478d-aaaa-de020039c519",
"metadata": {},
"outputs": [],
"source": [
"# spacy and nltk for lemmatization\n",
"import nltk \n",
"#nltk.download('stopwords')\n",
"import spacy\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"\n",
"stopwords = stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
"metadata": {},
"outputs": [],
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Compute simple descriptive statistics for one document.\n",
"\n",
"    Despite its name, `file` is the document's text (a string), not a path.\n",
"\n",
"    Returns a `(word_count, avg_word_length)` tuple, where words are the\n",
"    whitespace-separated tokens of the text; the average is 0 for empty text.\n",
"    \"\"\"\n",
"    tokens = file.split()\n",
"    n_tokens = len(tokens)\n",
"    if not tokens:\n",
"        return n_tokens, 0\n",
"    total_chars = sum(len(token) for token in tokens)\n",
"    return n_tokens, total_chars / n_tokens"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 5,
2025-02-02 21:42:09 +00:00
"id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_dir(directory):\n",
"    \"\"\"Read every path matched by `{directory}/*` and collect its text and statistics.\n",
"\n",
"    Files are decoded as UTF-8 with undecodable bytes ignored.\n",
"\n",
"    Returns four parallel lists, in sorted path order:\n",
"    `(data_list, word_counts, avg_word_lengths, file_list)`, i.e. the raw text,\n",
"    the word count, the average word length and the path of each file.\n",
"    \"\"\"\n",
"    data_list, word_counts, avg_word_lengths, file_list = [], [], [], []\n",
"    # glob returns paths in arbitrary order; sorting keeps the row index of every\n",
"    # document stable between runs, which matters because later cells index\n",
"    # file_list by position\n",
"    for file in sorted(glob.glob(f\"{directory}/*\")):\n",
"        # the context manager closes each handle instead of leaking it\n",
"        with open(file, encoding='utf-8', errors='ignore') as handle:\n",
"            text = handle.read()\n",
"        #here's some of the descriptive text analysis\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        data_list.append(text)\n",
"        file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
"metadata": {},
"outputs": [],
"source": [
"#preprocessing text data\n",
"def preprocess(corpus_list):\n",
"    \"\"\"Clean raw documents and return one lemmatized, space-joined string per document.\n",
"\n",
"    For each document: markdown is stripped with `strip_markdown`, HTML comments\n",
"    (`<!-- ... -->`) are removed, the text is tokenized with `simple_preprocess`,\n",
"    stopwords and tokens of 2 or fewer characters are dropped, and the remaining\n",
"    tokens are lemmatized with `WordNetLemmatizer`.\n",
"\n",
"    `corpus_list` itself is not modified.\n",
"    \"\"\"\n",
"    #extending stopwords \n",
"    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
"                          \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
"    # build a local set instead of extending the module-level `stopwords` list:\n",
"    # extending it grew the global with duplicates on every call, and a set makes\n",
"    # the per-token membership test constant-time\n",
"    stopword_set = set(stopwords).union(specific_stopwords)\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    D_lemma = []\n",
"    for doc in corpus_list:\n",
"        #stripping markdown from documents\n",
"        doc = strip_markdown(doc)\n",
"        #strip html comments (only comments are removed, not tags)\n",
"        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL)\n",
"        #mvp right now, can certainly be expanded as iterations of text analysis are done\n",
"        tokens = [token for token in simple_preprocess(doc) if token not in stopword_set and len(token) > 2]\n",
"        D_lemma.append(\" \".join(lemmatizer.lemmatize(token) for token in tokens))\n",
"    return D_lemma"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
"metadata": {},
"outputs": [],
"source": [
"#preparing processed data for model usage\n",
"def text_preparation(lemmatized_text):\n",
"    \"\"\"Build a gensim bag-of-words corpus and dictionary, with detected bigrams added.\n",
"\n",
"    `lemmatized_text` must be a list of token lists (one list of strings per\n",
"    document). Note that `preprocess` returns space-joined strings, which have to\n",
"    be split into tokens before being passed here.\n",
"\n",
"    Returns `(bag_of_words, id2word)`: the `doc2bow` representation of every\n",
"    document and the `Dictionary` used to build it, after dropping tokens that\n",
"    appear in fewer than 5 documents or in more than 50% of them.\n",
"\n",
"    Raises TypeError if any document is a plain string.\n",
"    \"\"\"\n",
"    if any(isinstance(doc, str) for doc in lemmatized_text):\n",
"        raise TypeError(\"text_preparation expects a list of token lists, not strings\")\n",
"    #bigrams\n",
"    # copy every document: a shallow copy of the outer list shares the inner lists,\n",
"    # so appending bigrams below would silently modify the caller's data\n",
"    D_bigrams = [list(doc) for doc in lemmatized_text]\n",
"    bigram = Phrases(D_bigrams, min_count=2)\n",
"    for doc in D_bigrams:\n",
"        # phrase tokens are the ones joined with '_'; the list is built before extending\n",
"        doc.extend([token for token in bigram[doc] if '_' in token])\n",
"    #id2word\n",
"    id2word = corpora.Dictionary(D_bigrams)\n",
"    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
"    #bow representation \n",
"    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n",
"    return bag_of_words, id2word"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "24799e25-2c0c-4e16-b503-68296f604f52",
"metadata": {},
"outputs": [],
"source": [
"def lda_model_identification(data_vectorized):\n",
"    \"\"\"Grid-search LDA hyperparameters on `data_vectorized` and print the outcome.\n",
"\n",
"    The grid fixes `n_components` at 11 and varies `learning_decay` and\n",
"    `batch_size`. Prints the best parameters, the best cross-validation score and\n",
"    the perplexity of the best estimator on `data_vectorized`. Returns nothing.\n",
"    \"\"\"\n",
"    param_grid = {\n",
"        'n_components': [11],\n",
"        'learning_decay': [.5, .7, .9],\n",
"        'batch_size': [128, 256],\n",
"    }\n",
"    grid_search = GridSearchCV(LatentDirichletAllocation(), param_grid=param_grid, verbose=10)\n",
"    grid_search.fit(data_vectorized)\n",
"    winner = grid_search.best_estimator_\n",
"    print(\"Best Model's Params: \", grid_search.best_params_)\n",
"    print(\"Best Log Likelihood Score: \", grid_search.best_score_)\n",
"    print(\"Model Perplexity: \", winner.perplexity(data_vectorized))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
"metadata": {},
"outputs": [],
"source": [
"def best_lda_model(data_vectorized, vocab, n_topics=11, n_top_words=10,\n",
"                   model_path='020325_README_lda.jl', random_state=None):\n",
"    \"\"\"Fit the final LDA model, print each topic's top words and save the model.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data_vectorized : document-term matrix to fit on.\n",
"    vocab : sequence mapping a column index of `data_vectorized` to its word.\n",
"    n_topics : number of topics to fit (default 11).\n",
"    n_top_words : how many highest-weighted words to print per topic (default 10).\n",
"    model_path : where the fitted model is written with `joblib.dump`.\n",
"    random_state : forwarded to `LatentDirichletAllocation`; pass an int to make\n",
"        the fit, and therefore the saved model and printed topics, reproducible.\n",
"\n",
"    Returns the document-topic matrix produced by `fit_transform`.\n",
"    \"\"\"\n",
"    # learning_decay and batch_size are the values chosen by the grid search\n",
"    lda = LatentDirichletAllocation(n_components=n_topics, learning_decay=0.9, batch_size=256,\n",
"                                    max_iter=50, random_state=random_state)\n",
"    id_topic = lda.fit_transform(data_vectorized)\n",
"    for topic, comp in enumerate(lda.components_):\n",
"        # indices of this topic's highest-weighted words, largest first\n",
"        word_idx = np.argsort(comp)[::-1][:n_top_words]\n",
"        print('Topic: %d' % topic)\n",
"        print(' %s' % ', '.join(vocab[i] for i in word_idx))\n",
"    joblib.dump(lda, model_path)\n",
"    return id_topic"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 10,
2025-02-02 21:42:09 +00:00
"id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
"metadata": {},
"outputs": [],
"source": [
"def get_most_prevalent(vect_documents, documents, model_path='020325_README_lda.jl'):\n",
"    \"\"\"Find, for every topic, the document with the highest weight on that topic.\n",
"\n",
"    Loads the saved LDA model from `model_path`, transforms `vect_documents` and\n",
"    prints and returns a dict `{topic_index: [weight, document]}`, where\n",
"    `document` is the entry of `documents` at the winning row. A topic on which no\n",
"    document has positive weight keeps the placeholder `[0, \"\"]`.\n",
"    \"\"\"\n",
"    # joblib.load unpickles the file, so only load model files you created yourself\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    # size the table from the model output instead of assuming 11 topics\n",
"    n_topics = distributions.shape[1]\n",
"    most_prevalent = {j: [0, \"\"] for j in range(n_topics)}\n",
"    for i, topic_distribution in enumerate(distributions):\n",
"        for j in range(n_topics):\n",
"            if topic_distribution[j] > most_prevalent[j][0]:\n",
"                most_prevalent[j] = [topic_distribution[j], documents[i]]\n",
"    print(most_prevalent)\n",
"    return most_prevalent\n"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 23,
2025-02-02 21:42:09 +00:00
"id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
"metadata": {},
"outputs": [],
"source": [
"def prevalent_topics(vect_documents, file_list, model_path='020325_README_lda.jl',\n",
"                     output_csv='020325_README_file_topic_distributions.csv', n_extremes=10):\n",
"    \"\"\"Report per-document topic distributions and the extreme documents per topic.\n",
"\n",
"    Loads the saved LDA model from `model_path`, transforms `vect_documents`, then:\n",
"\n",
"    * prints how many documents have their maximum topic weight shared by more\n",
"      than one topic;\n",
"    * writes `output_csv` with one row per document: `filename` (the part of the\n",
"      path after the last '/') followed by one `t<j>` column per topic;\n",
"    * for every topic prints the `n_extremes` highest- and lowest-weighted\n",
"      documents together with their paths from `file_list`;\n",
"    * prints the mean weight of every topic.\n",
"\n",
"    `file_list` must be in the same row order as `vect_documents`. The number of\n",
"    topics is taken from the model output. Returns nothing.\n",
"    \"\"\"\n",
"    # joblib.load unpickles the file, so only load model files you created yourself\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    n_topics = distributions.shape[1]\n",
"    # count documents with no single dominant topic (maximum weight is tied)\n",
"    count_of_multiple = 0\n",
"    for topic_distribution in distributions:\n",
"        if np.count_nonzero(topic_distribution == topic_distribution.max()) != 1:\n",
"            count_of_multiple += 1\n",
"    print(count_of_multiple)\n",
"    df = pd.DataFrame(distributions)\n",
"    #finding the distribution values for all documents\n",
"    topic_columns = ['t' + str(j) for j in range(n_topics)]\n",
"    with open(output_csv, 'w', newline='') as csvfile:\n",
"        writer = csv.DictWriter(csvfile, fieldnames=['filename'] + topic_columns)\n",
"        writer.writeheader()\n",
"        for i, topic_distribution in enumerate(distributions):\n",
"            project_dir = {'filename': file_list[i].split(\"/\")[-1]}\n",
"            project_dir.update(zip(topic_columns, topic_distribution))\n",
"            writer.writerow(project_dir)\n",
"    for topic in range(n_topics):\n",
"        print(\"-----------------------Topic \" + str(topic) + \" --------------------------------\")\n",
"        # highest-weighted documents first, then the lowest-weighted ones\n",
"        for extreme_rows in (df.nlargest(n_extremes, topic), df.nsmallest(n_extremes, topic)):\n",
"            print(extreme_rows)\n",
"            for index in extreme_rows.index.to_list():\n",
"                print(file_list[index])\n",
"    averages = df.mean()\n",
"    print(averages)\n"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 12,
2025-02-02 21:42:09 +00:00
"id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
"metadata": {},
"outputs": [],
"source": [
"def most_frequent(topic_prevalence, n_topics=11):\n",
"    \"\"\"Rank the values in `topic_prevalence` from most to least frequent.\n",
"\n",
"    Repeatedly takes the mode of the remaining values and removes every occurrence\n",
"    of it, for at most `n_topics` rounds. Prints and returns the resulting list,\n",
"    which is shorter than `n_topics` when fewer distinct values are present.\n",
"    The input list is not modified.\n",
"    \"\"\"\n",
"    most_frequent_array = []\n",
"    remaining = list(topic_prevalence)\n",
"    for _ in range(n_topics):\n",
"        # statistics.mode raises StatisticsError on empty data, which used to happen\n",
"        # whenever the input held fewer than n_topics distinct values\n",
"        if not remaining:\n",
"            break\n",
"        topic = mode(remaining)\n",
"        most_frequent_array.append(topic)\n",
"        remaining = [value for value in remaining if value != topic]\n",
"    print(most_frequent_array)\n",
"    return most_frequent_array"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 13,
2025-02-02 21:42:09 +00:00
"id": "69d606fd",
"metadata": {},
"outputs": [],
"source": [
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1f937c2e-2714-475d-b670-602164c46642",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-02-04 01:06:13 +00:00
"Mean wordcount: 271.6877796091359\n",
2025-02-02 21:42:09 +00:00
"Median wordcount: 98\n",
2025-02-04 01:06:13 +00:00
"Mean wordlength: 6.063122274716372\n",
"Median wordlength: 5.841269841269841\n"
2025-02-02 21:42:09 +00:00
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/SOC.NORTHWESTERN.EDU/nws8519/anaconda3/lib/python3.12/html/parser.py:171: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n",
" k = self.parse_starttag(i)\n"
]
}
],
"source": [
"listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)\n",
"print(\"Mean wordcount: \", mean(wordcounts))\n",
"print(\"Median wordcount: \", median(wordcounts))\n",
"print(\"Mean wordlength: \", mean(wordlengths))\n",
"print(\"Median wordlength: \", median(wordlengths))\n",
"lemmatized_corpus = preprocess(listed_corpus)"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 15,
2025-02-02 21:42:09 +00:00
"id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2025-02-04 01:06:13 +00:00
"['020325_README_vectorizer.joblib']"
2025-02-02 21:42:09 +00:00
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"'''\n",
"vectorizer = CountVectorizer(analyzer='word', \n",
" min_df=2, \n",
" stop_words='english', \n",
" lowercase=True, \n",
" token_pattern='[a-zA-Z0-9]{2,}', \n",
" )\n",
"data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
2025-02-04 01:06:13 +00:00
"joblib.dump(vectorizer, '020325_README_vectorizer.joblib')\n",
2025-02-02 21:42:09 +00:00
"'''\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d68aaf7b",
"metadata": {},
"outputs": [],
"source": [
2025-02-04 01:06:13 +00:00
"# Reuse the fitted vectorizer when it is cached on disk; otherwise fit and cache it,\n",
"# so this cell also works on a fresh checkout (Restart Kernel -> Run All).\n",
"VECTORIZER_PATH = '020325_README_vectorizer.joblib'\n",
"try:\n",
"    vectorizer = joblib.load(VECTORIZER_PATH)\n",
"except FileNotFoundError:\n",
"    vectorizer = CountVectorizer(analyzer='word',\n",
"                                 min_df=2,\n",
"                                 stop_words='english',\n",
"                                 lowercase=True,\n",
"                                 token_pattern='[a-zA-Z0-9]{2,}',\n",
"                                 )\n",
"    vectorizer.fit(lemmatized_corpus)\n",
"    joblib.dump(vectorizer, VECTORIZER_PATH)\n",
"data_vectorized = vectorizer.transform(lemmatized_corpus) "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "dd1a70c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
"[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1005863.489 total time= 10.2s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1011357.156 total time= 10.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1015386.424 total time= 10.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-965023.515 total time= 10.3s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-994223.612 total time= 9.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1006613.702 total time= 9.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1013817.544 total time= 9.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-1015692.660 total time= 10.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-966771.244 total time= 10.4s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=11;, score=-995596.978 total time= 10.3s\n",
2025-02-02 21:42:09 +00:00
"[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1005180.172 total time= 10.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1015590.801 total time= 10.6s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-1018907.455 total time= 11.1s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-964714.891 total time= 14.5s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=11;, score=-996172.263 total time= 13.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1006251.961 total time= 13.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1016869.369 total time= 14.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-1017346.297 total time= 14.2s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-964440.209 total time= 13.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=11;, score=-997104.875 total time= 14.1s\n",
2025-02-02 21:42:09 +00:00
"[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1005428.337 total time= 13.9s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1013241.313 total time= 14.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-1014764.423 total time= 13.6s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-964830.614 total time= 14.3s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=11;, score=-996764.609 total time= 14.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1004883.859 total time= 13.6s\n",
2025-02-02 21:42:09 +00:00
"[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1007656.712 total time= 13.5s\n",
2025-02-02 21:42:09 +00:00
"[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-1015740.335 total time= 14.0s\n",
2025-02-02 21:42:09 +00:00
"[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-966718.005 total time= 13.8s\n",
2025-02-02 21:42:09 +00:00
"[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=11.........\n",
2025-02-04 01:06:13 +00:00
"[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=11;, score=-996819.504 total time= 13.8s\n",
"Best Model's Params: {'batch_size': 256, 'learning_decay': 0.9, 'n_components': 11}\n",
"Best Log Likelihood Score: -998363.6828381404\n",
"Model Perplexity: 2076.905945051809\n"
2025-02-02 21:42:09 +00:00
]
}
],
"source": [
"lda_model_identification(data_vectorized)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "aa83d20f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic: 0\n",
2025-02-04 01:06:13 +00:00
" test, library, object, google, include, class, interface, using, build, example\n",
2025-02-02 21:42:09 +00:00
"Topic: 1\n",
2025-02-04 01:06:13 +00:00
" server, client, option, command, network, device, port, support, interface, default\n",
2025-02-02 21:42:09 +00:00
"Topic: 2\n",
2025-02-04 01:06:13 +00:00
" value, function, string, data, object, return, type, table, method, error\n",
2025-02-02 21:42:09 +00:00
"Topic: 3\n",
2025-02-04 01:06:13 +00:00
" install, build, make, package, configure, debian, git, need, directory, gnome\n",
2025-02-02 21:42:09 +00:00
"Topic: 4\n",
2025-02-04 01:06:13 +00:00
" obj, filter, stream, length, type, page, count, parent, max, resource\n",
2025-02-02 21:42:09 +00:00
"Topic: 5\n",
2025-02-04 01:06:13 +00:00
" window, mode, color, game, key, menu, default, size, button, sound\n",
2025-02-02 21:42:09 +00:00
"Topic: 6\n",
2025-02-04 01:06:13 +00:00
" file, directory, path, install, make, command, default, version, option, usr\n",
2025-02-02 21:42:09 +00:00
"Topic: 7\n",
2025-02-04 01:06:13 +00:00
" license, version, gnu, http, public, free, general, copyright, project, install\n",
2025-02-02 21:42:09 +00:00
"Topic: 8\n",
2025-02-04 01:06:13 +00:00
" model, django, url, module, password, key, import, request, date, add\n",
2025-02-02 21:42:09 +00:00
"Topic: 9\n",
2025-02-04 01:06:13 +00:00
" library, file, version, make, module, perl, support, makefile, image, program\n",
2025-02-02 21:42:09 +00:00
"Topic: 10\n",
2025-02-04 01:06:13 +00:00
" html, git, copyright, license, copy, text, json, example, new, install\n"
2025-02-02 21:42:09 +00:00
]
}
],
"source": [
"topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f4345bd6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: [0.9998131703476353, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'], 1: [0.9936580635354768, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'], 2: [0.9992995657213791, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'], 3: [0.988192939654375, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'], 4: [0.9964897891037261, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'], 5: [0.9943880112670485, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'], 6: [0.999759729377782, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md'], 7: [0.998666933112433, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst'], 8: [0.9996679425883734, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md'], 9: [0.99815957978939, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/agmartin_linuxdoc-tools_hullabaloo_README'], 10: [0.9996663626936376, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt']}\n"
]
},
{
"data": {
"text/plain": [
"{0: [0.9998131703476353,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'],\n",
" 1: [0.9936580635354768,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'],\n",
" 2: [0.9992995657213791,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'],\n",
" 3: [0.988192939654375,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'],\n",
" 4: [0.9964897891037261,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'],\n",
" 5: [0.9943880112670485,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'],\n",
" 6: [0.999759729377782,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md'],\n",
" 7: [0.998666933112433,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst'],\n",
" 8: [0.9996679425883734,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md'],\n",
" 9: [0.99815957978939,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/agmartin_linuxdoc-tools_hullabaloo_README'],\n",
" 10: [0.9996663626936376,\n",
" '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt']}"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_most_prevalent(data_vectorized, file_list)"
]
},
{
"cell_type": "code",
2025-02-04 01:06:13 +00:00
"execution_count": 24,
2025-02-02 21:42:09 +00:00
"id": "23468e82",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"349\n",
"-----------------------Topic 0 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"3551 0.984847 0.001515 0.001515 0.001515 0.001515 0.001515 0.001515 \n",
"3413 0.981059 0.001894 0.001894 0.001894 0.001894 0.001894 0.001894 \n",
"3396 0.973259 0.002674 0.002674 0.002674 0.002674 0.002675 0.002674 \n",
"1240 0.965032 0.003497 0.003497 0.003497 0.003497 0.003497 0.003497 \n",
"946 0.960470 0.003953 0.003953 0.003953 0.003953 0.003953 0.003953 \n",
"2914 0.958673 0.004133 0.004132 0.004133 0.004133 0.004132 0.004133 \n",
"225 0.954660 0.000918 0.000918 0.000918 0.000918 0.000918 0.000918 \n",
"2355 0.943176 0.005683 0.005682 0.005682 0.005682 0.005682 0.005682 \n",
"2913 0.943019 0.016762 0.000654 0.000654 0.000654 0.000654 0.000654 \n",
"901 0.942914 0.001421 0.001420 0.044301 0.001421 0.001421 0.001421 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"3551 0.001515 0.001515 0.001515 0.001515 \n",
"3413 0.001894 0.001894 0.001894 0.001894 \n",
"3396 0.002674 0.002674 0.002674 0.002674 \n",
"1240 0.003497 0.003497 0.003497 0.003497 \n",
"946 0.003953 0.003953 0.003953 0.003953 \n",
"2914 0.004133 0.004133 0.004133 0.004133 \n",
"225 0.000918 0.000918 0.037075 0.000918 \n",
"2355 0.005682 0.005682 0.005682 0.005682 \n",
"2913 0.000654 0.034986 0.000654 0.000654 \n",
"901 0.001421 0.001420 0.001421 0.001421 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/pytest-dev_pytest-runner.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/twisted_pydoctor_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/pd-externals_ggee_hullabaloo_README.ggext\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/biojava_biojava.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/LLNL_sundials.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ClusterLabs_pacemaker_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/wolever_parameterized_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jupyter_nbconvert_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/pymodbus-dev_pymodbus.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/zopefoundation_zope.proxy_hullabaloo_README.txt\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"1106 0.000030 0.029005 0.708216 0.021411 0.000030 0.000030 0.166185 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"1435 0.000038 0.158357 0.000038 0.082379 0.000038 0.685618 0.000038 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3589 0.000043 0.000043 0.625379 0.000043 0.001690 0.093226 0.000043 \n",
"831 0.000043 0.000043 0.150073 0.000043 0.000043 0.842702 0.000043 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"1106 0.028543 0.000030 0.046489 0.000030 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"1435 0.005478 0.006674 0.061303 0.000038 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3589 0.000043 0.000043 0.279405 0.000043 \n",
"831 0.003522 0.000043 0.003402 0.000043 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 1 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"2395 0.000275 0.997253 0.000275 0.000275 0.000275 0.000275 0.000275 \n",
"80 0.000758 0.992423 0.000758 0.000758 0.000758 0.000758 0.000758 \n",
"534 0.000716 0.983133 0.000716 0.000716 0.010424 0.000716 0.000716 \n",
"62 0.000928 0.979982 0.000928 0.000928 0.011669 0.000928 0.000928 \n",
"1066 0.000866 0.971457 0.000866 0.000866 0.020751 0.000866 0.000866 \n",
"98 0.024042 0.967857 0.000900 0.000900 0.000900 0.000900 0.000900 \n",
"4070 0.001299 0.952846 0.001299 0.001299 0.001299 0.001299 0.001299 \n",
"3923 0.005051 0.949491 0.005051 0.005051 0.005051 0.005051 0.005051 \n",
"229 0.024377 0.947226 0.000168 0.000168 0.000168 0.000168 0.000168 \n",
"3313 0.005348 0.946520 0.005348 0.005348 0.005348 0.005348 0.005348 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"2395 0.000275 0.000275 0.000275 0.000275 \n",
"80 0.000758 0.000758 0.000758 0.000758 \n",
"534 0.000716 0.000716 0.000716 0.000716 \n",
"62 0.000928 0.000928 0.000928 0.000928 \n",
"1066 0.000866 0.000866 0.000866 0.000866 \n",
"98 0.000900 0.000900 0.000900 0.000900 \n",
"4070 0.035465 0.001299 0.001299 0.001299 \n",
"3923 0.005051 0.005051 0.005051 0.005051 \n",
"229 0.000168 0.000168 0.000168 0.027053 \n",
"3313 0.005348 0.005348 0.005348 0.005349 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/cdidier_irssi-xmpp_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/alsa-lib.git_hullabaloo_README.aconnect\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bolt_bolt_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/batctl.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/royhills_arp-scan_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ThomasHabets_arping_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/raboof_nethogs_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/python-zeroconf_python-zeroconf.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/lxc_lxc.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/troglobit_smcroute.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"3589 0.000043 0.000043 0.625379 0.000043 0.001690 0.093226 0.000043 \n",
"831 0.000043 0.000043 0.150073 0.000043 0.000043 0.842702 0.000043 \n",
"2060 0.236422 0.000045 0.312054 0.000045 0.365046 0.000045 0.000045 \n",
"2514 0.895267 0.000045 0.000045 0.000045 0.017315 0.000045 0.023678 \n",
"2065 0.192353 0.000046 0.581575 0.000046 0.000046 0.098829 0.000046 \n",
"3934 0.000048 0.000048 0.000048 0.000048 0.000048 0.945055 0.054512 \n",
"1907 0.000050 0.000050 0.014488 0.000050 0.000050 0.985059 0.000050 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"3589 0.000043 0.000043 0.279405 0.000043 \n",
"831 0.003522 0.000043 0.003402 0.000043 \n",
"2060 0.000045 0.000045 0.078141 0.008067 \n",
"2514 0.019573 0.000045 0.043894 0.000045 \n",
"2065 0.000046 0.000046 0.010390 0.116578 \n",
"3934 0.000048 0.000048 0.000048 0.000048 \n",
"1907 0.000050 0.000050 0.000050 0.000050 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enthought_mayavi.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jrincayc_ucblogo-code.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/trackballs_trackballs.git_hullabaloo_README.html\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 2 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1202 0.000909 0.000909 0.990908 0.000909 0.000909 0.000909 0.000909 \n",
"2504 0.001151 0.001151 0.988492 0.001151 0.001151 0.001151 0.001151 \n",
"1512 0.001166 0.001166 0.988344 0.001166 0.001166 0.001166 0.001166 \n",
"1392 0.001894 0.001894 0.981059 0.001894 0.001894 0.001894 0.001894 \n",
"3590 0.000248 0.000248 0.951422 0.000248 0.000248 0.000248 0.000248 \n",
"2397 0.000301 0.055197 0.942094 0.000301 0.000301 0.000301 0.000301 \n",
"2999 0.000107 0.000107 0.928211 0.046276 0.000107 0.000107 0.000107 \n",
"1140 0.001399 0.001399 0.913971 0.001399 0.001399 0.001399 0.001399 \n",
"872 0.000999 0.000999 0.908382 0.000999 0.000999 0.000999 0.000999 \n",
"2833 0.000093 0.095750 0.903416 0.000093 0.000093 0.000093 0.000093 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1202 0.000909 0.000909 0.000909 0.000909 \n",
"2504 0.001151 0.001151 0.001151 0.001151 \n",
"1512 0.001166 0.001166 0.001166 0.001166 \n",
"1392 0.001894 0.001894 0.001894 0.001894 \n",
"3590 0.000248 0.000248 0.000248 0.046342 \n",
"2397 0.000301 0.000301 0.000301 0.000301 \n",
"2999 0.000107 0.000107 0.024656 0.000107 \n",
"1140 0.001399 0.001399 0.001399 0.073441 \n",
"872 0.000999 0.000999 0.000999 0.082626 \n",
"2833 0.000093 0.000093 0.000093 0.000093 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mathiasbynens_unicode-property-value-aliases-ecmascript_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mathiasbynens_unicode-property-aliases-ecmascript_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mathiasbynens_regenerate-unicode-properties_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jdunck_python-unicodecsv_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/nichtich_RDF-NS.git_hullabaloo_README.mkdn\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/redis_hiredis_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bearded_ruby-ldap_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mathiasbynens_unicode-property-aliases_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mathiasbynens_unicode-property-value-aliases.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/nicolasff_webdis_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"1604 0.003947 0.126398 0.000025 0.039761 0.000025 0.013216 0.735422 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"1435 0.000038 0.158357 0.000038 0.082379 0.000038 0.685618 0.000038 \n",
"2514 0.895267 0.000045 0.000045 0.000045 0.017315 0.000045 0.023678 \n",
"3934 0.000048 0.000048 0.000048 0.000048 0.000048 0.945055 0.054512 \n",
"1386 0.000050 0.659359 0.000050 0.082049 0.000050 0.000050 0.079319 \n",
"2133 0.000057 0.286587 0.000057 0.000057 0.000057 0.404221 0.090777 \n",
"789 0.000058 0.483836 0.000058 0.000058 0.000058 0.400501 0.000058 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"1604 0.005943 0.000025 0.075213 0.000025 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"1435 0.005478 0.006674 0.061303 0.000038 \n",
"2514 0.019573 0.000045 0.043894 0.000045 \n",
"3934 0.000048 0.000048 0.000048 0.000048 \n",
"1386 0.178925 0.000050 0.000050 0.000050 \n",
"2133 0.033438 0.000057 0.184634 0.000057 \n",
"789 0.096216 0.000058 0.019038 0.000058 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jrincayc_ucblogo-code.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/arno-iptables-firewall_aif.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/blais_xxdiff.git_hullabaloo_README.build\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/swami_swami_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 3 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1363 0.000739 0.000739 0.000739 0.992609 0.000739 0.000739 0.000739 \n",
"339 0.000805 0.000805 0.000805 0.991954 0.000805 0.000805 0.000805 \n",
"4200 0.000805 0.000805 0.000805 0.991954 0.000805 0.000805 0.000805 \n",
"3897 0.000819 0.000819 0.000819 0.991810 0.000819 0.000819 0.000819 \n",
"1693 0.000819 0.000819 0.000819 0.991810 0.000819 0.000819 0.000819 \n",
"1464 0.000850 0.000850 0.000850 0.991503 0.000850 0.000850 0.000850 \n",
"3669 0.000850 0.000850 0.000850 0.991503 0.000850 0.000850 0.000850 \n",
"1157 0.000978 0.000978 0.000978 0.990224 0.000978 0.000978 0.000978 \n",
"1825 0.000988 0.000988 0.000988 0.990118 0.000988 0.000988 0.000988 \n",
"3919 0.000988 0.000988 0.000988 0.990118 0.000988 0.000988 0.000988 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1363 0.000739 0.000739 0.000739 0.000739 \n",
"339 0.000805 0.000805 0.000805 0.000805 \n",
"4200 0.000805 0.000805 0.000805 0.000805 \n",
"3897 0.000819 0.000819 0.000819 0.000819 \n",
"1693 0.000819 0.000819 0.000819 0.000819 \n",
"1464 0.000850 0.000850 0.000850 0.000850 \n",
"3669 0.000850 0.000850 0.000850 0.000850 \n",
"1157 0.000978 0.000978 0.000978 0.000978 \n",
"1825 0.000988 0.000988 0.000988 0.000988 \n",
"3919 0.000988 0.000988 0.000988 0.000988 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_mate-panel.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_mate-desktop.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_cinnamon-desktop.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_mate-menus.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_cinnamon-menus.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_mate-screensaver.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_cinnamon-screensaver.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_libgudev_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mate-desktop_mate-session-manager.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_cinnamon-session.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"375 0.117647 0.046106 0.014596 0.000024 0.000024 0.131388 0.000024 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3589 0.000043 0.000043 0.625379 0.000043 0.001690 0.093226 0.000043 \n",
"831 0.000043 0.000043 0.150073 0.000043 0.000043 0.842702 0.000043 \n",
"3483 0.000044 0.767987 0.136634 0.000044 0.009711 0.000044 0.000044 \n",
"2060 0.236422 0.000045 0.312054 0.000045 0.365046 0.000045 0.000045 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"375 0.000024 0.000024 0.004995 0.685147 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3589 0.000043 0.000043 0.279405 0.000043 \n",
"831 0.003522 0.000043 0.003402 0.000043 \n",
"3483 0.000044 0.000044 0.085361 0.000044 \n",
"2060 0.000045 0.000045 0.078141 0.008067 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/OpenPrinting_foomatic-db.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 4 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"226 0.010102 0.010103 0.010104 0.010103 0.898973 0.010103 0.010103 \n",
"1042 0.022732 0.022729 0.022732 0.022727 0.772689 0.022727 0.022735 \n",
"4129 0.030303 0.030303 0.030303 0.030308 0.696963 0.030303 0.030303 \n",
"3097 0.030303 0.030303 0.030303 0.030303 0.696961 0.030303 0.030303 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3127 0.018183 0.018183 0.018182 0.018184 0.606181 0.018182 0.018184 \n",
"2259 0.045455 0.045455 0.045455 0.045455 0.545455 0.045455 0.045455 \n",
"1226 0.045455 0.045455 0.045455 0.045455 0.545452 0.045455 0.045455 \n",
"2953 0.045455 0.045456 0.045455 0.045456 0.545443 0.045455 0.045456 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"226 0.010101 0.010104 0.010102 0.010102 \n",
"1042 0.022732 0.022735 0.022731 0.022730 \n",
"4129 0.030305 0.030303 0.030303 0.030303 \n",
"3097 0.030303 0.030303 0.030303 0.030311 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3127 0.230172 0.018182 0.018184 0.018183 \n",
"2259 0.045455 0.045455 0.045455 0.045455 \n",
"1226 0.045455 0.045455 0.045457 0.045455 \n",
"2953 0.045455 0.045455 0.045456 0.045459 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/egh_ledger-autosync_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/fcitx_fcitx-libpinyin_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ukui_ukui-indicators_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ukui_ukui-sidebar_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/phihag_ipaddress_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jpy-consortium_jpy_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ntop_nDPI.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ejeschke_ginga.git_hullabaloo_README.rst\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"375 0.117647 0.046106 0.014596 0.000024 0.000024 0.131388 0.000024 \n",
"1604 0.003947 0.126398 0.000025 0.039761 0.000025 0.013216 0.735422 \n",
"1106 0.000030 0.029005 0.708216 0.021411 0.000030 0.000030 0.166185 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"1435 0.000038 0.158357 0.000038 0.082379 0.000038 0.685618 0.000038 \n",
"831 0.000043 0.000043 0.150073 0.000043 0.000043 0.842702 0.000043 \n",
"2468 0.013769 0.908888 0.020981 0.026627 0.000045 0.000045 0.000045 \n",
"2065 0.192353 0.000046 0.581575 0.000046 0.000046 0.098829 0.000046 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"375 0.000024 0.000024 0.004995 0.685147 \n",
"1604 0.005943 0.000025 0.075213 0.000025 \n",
"1106 0.028543 0.000030 0.046489 0.000030 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"1435 0.005478 0.006674 0.061303 0.000038 \n",
"831 0.003522 0.000043 0.003402 0.000043 \n",
"2468 0.005386 0.002043 0.022125 0.000045 \n",
"2065 0.000046 0.000046 0.010390 0.116578 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jmtd_wadc_hullabaloo_README.md\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ppp-project_ppp_hullabaloo_README.linux\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enthought_mayavi.git_hullabaloo_README.txt\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 5 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1907 0.000050 0.000050 0.014488 0.000050 0.000050 0.985059 0.000050 \n",
"421 0.003367 0.003368 0.003367 0.003368 0.003367 0.966327 0.003367 \n",
"2871 0.003788 0.003788 0.003788 0.003788 0.003788 0.962118 0.003788 \n",
"3904 0.003788 0.003788 0.003788 0.003788 0.003788 0.962118 0.003788 \n",
"4036 0.003788 0.003788 0.003788 0.003788 0.003788 0.962118 0.003788 \n",
"3934 0.000048 0.000048 0.000048 0.000048 0.000048 0.945055 0.054512 \n",
"3842 0.009091 0.009093 0.009091 0.009092 0.009091 0.909085 0.009091 \n",
"1884 0.000179 0.019765 0.000179 0.069225 0.000179 0.897342 0.000179 \n",
"1596 0.080111 0.000526 0.000526 0.000526 0.010355 0.897019 0.000526 \n",
"3193 0.032556 0.000315 0.000315 0.032793 0.000315 0.864536 0.000315 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1907 0.000050 0.000050 0.000050 0.000050 \n",
"421 0.003368 0.003367 0.003367 0.003367 \n",
"2871 0.003789 0.003788 0.003788 0.003788 \n",
"3904 0.003789 0.003788 0.003788 0.003788 \n",
"4036 0.003789 0.003788 0.003788 0.003788 \n",
"3934 0.000048 0.000048 0.000048 0.000048 \n",
"3842 0.009091 0.009091 0.009092 0.009091 \n",
"1884 0.012415 0.000179 0.000179 0.000179 \n",
"1596 0.000525 0.000526 0.008836 0.000526 \n",
"3193 0.000315 0.000315 0.067913 0.000315 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/trackballs_trackballs.git_hullabaloo_README.html\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kodi-pvr_pvr.iptvsimple.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kodi-pvr_pvr.njoy.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kodi-pvr_pvr.hdhomerun.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kodi-pvr_pvr.dvbviewer.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jrincayc_ucblogo-code.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chrender_fizmo-ncursesw_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/callaa_luola.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mawww_kakoune.git_hullabaloo_README.asciidoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bomber.git_hullabaloo_README.themes\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"3560 0.420193 0.034504 0.322584 0.020994 0.002791 0.000024 0.000024 \n",
"1106 0.000030 0.029005 0.708216 0.021411 0.000030 0.000030 0.166185 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3483 0.000044 0.767987 0.136634 0.000044 0.009711 0.000044 0.000044 \n",
"2060 0.236422 0.000045 0.312054 0.000045 0.365046 0.000045 0.000045 \n",
"2468 0.013769 0.908888 0.020981 0.026627 0.000045 0.000045 0.000045 \n",
"2514 0.895267 0.000045 0.000045 0.000045 0.017315 0.000045 0.023678 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"3560 0.015793 0.104444 0.078624 0.000024 \n",
"1106 0.028543 0.000030 0.046489 0.000030 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3483 0.000044 0.000044 0.085361 0.000044 \n",
"2060 0.000045 0.000045 0.078141 0.008067 \n",
"2468 0.005386 0.002043 0.022125 0.000045 \n",
"2514 0.019573 0.000045 0.043894 0.000045 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/OpenPrinting_foomatic-db.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ppp-project_ppp_hullabaloo_README.linux\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 6 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"289 0.000215 0.000215 0.000215 0.000215 0.000215 0.000215 0.997846 \n",
"1141 0.001122 0.001122 0.001122 0.001122 0.001122 0.001122 0.988775 \n",
"2865 0.001421 0.001421 0.001421 0.001421 0.001420 0.001421 0.985794 \n",
"2572 0.001653 0.001653 0.001653 0.001653 0.001653 0.001653 0.983470 \n",
"1750 0.001894 0.001894 0.001894 0.001894 0.001894 0.001894 0.981059 \n",
"510 0.003247 0.003247 0.003247 0.003247 0.003247 0.003247 0.967528 \n",
"2034 0.003497 0.003497 0.003497 0.003497 0.003497 0.003497 0.965033 \n",
"291 0.003789 0.003788 0.003788 0.003788 0.003788 0.003788 0.962118 \n",
"156 0.004133 0.004133 0.004133 0.004133 0.004132 0.004132 0.958675 \n",
"3778 0.005682 0.005682 0.005682 0.005682 0.005682 0.005682 0.943176 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"289 0.000215 0.000215 0.000215 0.000215 \n",
"1141 0.001122 0.001122 0.001123 0.001122 \n",
"2865 0.001421 0.001421 0.001421 0.001421 \n",
"2572 0.001653 0.001653 0.001653 0.001653 \n",
"1750 0.001894 0.001894 0.001894 0.001894 \n",
"510 0.003248 0.003247 0.003247 0.003247 \n",
"2034 0.003497 0.003497 0.003497 0.003497 \n",
"291 0.003788 0.003788 0.003788 0.003788 \n",
"156 0.004132 0.004132 0.004133 0.004132 \n",
"3778 0.005683 0.005682 0.005683 0.005683 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/wanderlust_semi_hullabaloo_README.en\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ActiveState_appdirs.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/df7cb_sdate_hullabaloo_README.fake\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/gosa-project_gosa-core_hullabaloo_README.safemode\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tkf_emacs-python-environment.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/hhatto_autopep8.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/skk-dev_skktools_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ganglia_ganglia-modules-linux.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/zevv_duc_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/slicer69_sysvinit_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"375 0.117647 0.046106 0.014596 0.000024 0.000024 0.131388 0.000024 \n",
"3560 0.420193 0.034504 0.322584 0.020994 0.002791 0.000024 0.000024 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"1435 0.000038 0.158357 0.000038 0.082379 0.000038 0.685618 0.000038 \n",
"3589 0.000043 0.000043 0.625379 0.000043 0.001690 0.093226 0.000043 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"375 0.000024 0.000024 0.004995 0.685147 \n",
"3560 0.015793 0.104444 0.078624 0.000024 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"1435 0.005478 0.006674 0.061303 0.000038 \n",
"3589 0.000043 0.000043 0.279405 0.000043 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 7 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1146 0.000457 0.000457 0.000457 0.000457 0.000457 0.000457 0.000457 \n",
"4245 0.000544 0.000544 0.000544 0.000544 0.000544 0.000544 0.000544 \n",
"2853 0.000558 0.000558 0.000558 0.000558 0.000558 0.000558 0.000558 \n",
"3625 0.000587 0.000587 0.000587 0.000587 0.000587 0.000587 0.000587 \n",
"1663 0.000598 0.000598 0.000598 0.000598 0.000598 0.000598 0.000598 \n",
"2567 0.000623 0.000623 0.000623 0.000623 0.000623 0.000623 0.000623 \n",
"1779 0.000699 0.000699 0.000699 0.000699 0.000699 0.000699 0.000699 \n",
"1265 0.000777 0.000777 0.000777 0.000777 0.000777 0.000777 0.000777 \n",
"2103 0.000812 0.000812 0.000812 0.000812 0.000812 0.000812 0.000812 \n",
"4023 0.000834 0.000834 0.000834 0.000834 0.000834 0.000834 0.000834 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1146 0.995431 0.000457 0.000457 0.000457 \n",
"4245 0.994556 0.000544 0.000544 0.000544 \n",
"2853 0.994422 0.000558 0.000558 0.000558 \n",
"3625 0.994135 0.000587 0.000587 0.000587 \n",
"1663 0.994019 0.000598 0.000598 0.000598 \n",
"2567 0.993773 0.000623 0.000623 0.000623 \n",
"1779 0.993007 0.000699 0.000699 0.000699 \n",
"1265 0.992230 0.000777 0.000777 0.000777 \n",
"2103 0.991883 0.000812 0.000812 0.000812 \n",
"4023 0.991659 0.000834 0.000834 0.000834 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_imp.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_kronolith.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tkem_mopidy-dleyna.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mopidy_mopidy-alsamixer_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_sesha.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mopidy_mopidy-mpris.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tkem_mopidy-internetarchive_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kingosticks_mopidy-tunein_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tkem_mopidy-podcast-itunes_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_trean.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"375 0.117647 0.046106 0.014596 0.000024 0.000024 0.131388 0.000024 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3589 0.000043 0.000043 0.625379 0.000043 0.001690 0.093226 0.000043 \n",
"3483 0.000044 0.767987 0.136634 0.000044 0.009711 0.000044 0.000044 \n",
"2060 0.236422 0.000045 0.312054 0.000045 0.365046 0.000045 0.000045 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"375 0.000024 0.000024 0.004995 0.685147 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3589 0.000043 0.000043 0.279405 0.000043 \n",
"3483 0.000044 0.000044 0.085361 0.000044 \n",
"2060 0.000045 0.000045 0.078141 0.008067 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_genius_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/OpenPrinting_foomatic-db.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/EsotericSoftware_kryo.git_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 8 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1118 0.001541 0.001541 0.001541 0.001541 0.001541 0.001541 0.001541 \n",
"1099 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 \n",
"2769 0.003031 0.003031 0.003030 0.003031 0.003030 0.003031 0.003031 \n",
"1427 0.019060 0.000123 0.000123 0.000123 0.022132 0.000123 0.000123 \n",
"3158 0.006994 0.006993 0.006993 0.006994 0.006993 0.006993 0.006993 \n",
"1537 0.031350 0.001337 0.001337 0.001337 0.027705 0.029067 0.001337 \n",
"3399 0.010101 0.010103 0.010101 0.010101 0.010101 0.010101 0.010101 \n",
"2492 0.011364 0.011367 0.011364 0.011364 0.011364 0.011365 0.011365 \n",
"2617 0.004329 0.004329 0.004329 0.004329 0.004329 0.004330 0.079500 \n",
"2315 0.012988 0.012987 0.012988 0.012987 0.012987 0.012988 0.012988 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1118 0.001541 0.984591 0.001541 0.001541 \n",
"1099 0.019750 0.979350 0.000100 0.000100 \n",
"2769 0.003031 0.969694 0.003031 0.003031 \n",
"1427 0.000123 0.957827 0.000123 0.000123 \n",
"3158 0.006993 0.930067 0.006993 0.006993 \n",
"1537 0.001337 0.902518 0.001337 0.001337 \n",
"3399 0.010101 0.898986 0.010101 0.010101 \n",
"2492 0.011364 0.886356 0.011364 0.011364 \n",
"2617 0.004329 0.881536 0.004329 0.004330 \n",
"2315 0.012987 0.870125 0.012988 0.012987 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/tax_python-requests-aws_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bestpractical_rt-extension-repeatticket_hullabaloo_README.pod\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/ericvsmith_toposort_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/zapier_django-rest-hooks_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chibisov_drf-extensions.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bfirsh_django-ordered-model.git_hullabaloo_README.markdown\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/mikeal_tunnel-agent_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Castaglia_proftpd-mod_vroot.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/LeaVerou_prism.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/unbit_django-uwsgi_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"734 0.000018 0.009437 0.000018 0.000018 0.000018 0.273398 0.000018 \n",
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"375 0.117647 0.046106 0.014596 0.000024 0.000024 0.131388 0.000024 \n",
"1604 0.003947 0.126398 0.000025 0.039761 0.000025 0.013216 0.735422 \n",
"1106 0.000030 0.029005 0.708216 0.021411 0.000030 0.000030 0.166185 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"734 0.008753 0.000018 0.690342 0.017963 \n",
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"375 0.000024 0.000024 0.004995 0.685147 \n",
"1604 0.005943 0.000025 0.075213 0.000025 \n",
"1106 0.028543 0.000030 0.046489 0.000030 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/audacity_audacity.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bluca_gsl_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"-----------------------Topic 9 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"4236 0.000083 0.000083 0.000083 0.000083 0.000083 0.000083 0.000083 \n",
"3501 0.000110 0.000110 0.000110 0.000110 0.000110 0.000110 0.000110 \n",
"2181 0.000145 0.000145 0.000145 0.000145 0.000145 0.000145 0.000145 \n",
"2877 0.000255 0.000255 0.000255 0.000255 0.000255 0.000255 0.000255 \n",
"559 0.000283 0.000283 0.000283 0.000283 0.000283 0.000283 0.000283 \n",
"2673 0.000722 0.000722 0.000722 0.000722 0.000722 0.000722 0.000722 \n",
"2255 0.000957 0.000958 0.000957 0.000957 0.000957 0.000957 0.000957 \n",
"3216 0.001010 0.001010 0.001010 0.001010 0.001010 0.001010 0.001010 \n",
"3285 0.001010 0.001010 0.001010 0.001010 0.001010 0.001010 0.001010 \n",
"1546 0.001022 0.001022 0.001021 0.001022 0.001021 0.001021 0.001022 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"4236 0.000083 0.000083 0.999171 0.000083 \n",
"3501 0.000110 0.000110 0.998901 0.000110 \n",
"2181 0.000145 0.000145 0.998550 0.000145 \n",
"2877 0.000255 0.000255 0.997453 0.000255 \n",
"559 0.000283 0.000283 0.997168 0.000283 \n",
"2673 0.000722 0.000722 0.992784 0.000722 \n",
"2255 0.000957 0.000957 0.990429 0.000957 \n",
"3216 0.001010 0.001010 0.989898 0.001010 \n",
"3285 0.001010 0.001010 0.989898 0.001010 \n",
"1546 0.001022 0.001022 0.989785 0.001022 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/osslugaru_lugaru_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/michaelrsweet_htmldoc.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/glennrp_libpng_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/silx-kit_fabio.git_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/gyunaev_kchmviewer_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/book_Test-Database.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sebastianbergmann_php-invoker_hullabaloo_README.markdown\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sebastianbergmann_php-timer_hullabaloo_README.markdown\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sebastianbergmann_php-file-iterator_hullabaloo_README.markdown\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/bobtfish_directory-scratch.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
"3934 0.000048 0.000048 0.000048 0.000048 0.000048 0.945055 0.054512 \n",
"2954 0.308152 0.151885 0.017462 0.026990 0.000050 0.003691 0.491573 \n",
"1386 0.000050 0.659359 0.000050 0.082049 0.000050 0.000050 0.079319 \n",
"1907 0.000050 0.000050 0.014488 0.000050 0.000050 0.985059 0.000050 \n",
"112 0.000051 0.000051 0.336407 0.000051 0.000051 0.663130 0.000051 \n",
"2565 0.000053 0.355726 0.097612 0.000053 0.000053 0.079875 0.466419 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"3934 0.000048 0.000048 0.000048 0.000048 \n",
"2954 0.000050 0.000050 0.000050 0.000050 \n",
"1386 0.178925 0.000050 0.000050 0.000050 \n",
"1907 0.000050 0.000050 0.000050 0.000050 \n",
"112 0.000051 0.000051 0.000051 0.000051 \n",
"2565 0.000053 0.000053 0.000053 0.000053 \n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jrincayc_ucblogo-code.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jirka-h_haveged_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/arno-iptables-firewall_aif.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/trackballs_trackballs.git_hullabaloo_README.html\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/GNOME_evolution_hullabaloo_README.TXT\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/punitagrawal_global_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"-----------------------Topic 10 --------------------------------\n",
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"122 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"2996 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"4117 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 0.000133 \n",
"2682 0.000805 0.000805 0.000805 0.000805 0.000805 0.000805 0.000805 \n",
"2191 0.000850 0.000850 0.000850 0.000850 0.000850 0.000850 0.000850 \n",
"1129 0.000858 0.000858 0.000858 0.000858 0.000858 0.000858 0.000858 \n",
"3720 0.000928 0.000928 0.000928 0.000928 0.000928 0.000928 0.000928 \n",
"3739 0.000937 0.000937 0.000937 0.000937 0.000937 0.000937 0.000937 \n",
"935 0.000937 0.000937 0.000937 0.000937 0.000937 0.000937 0.000937 \n",
"1335 0.000947 0.000947 0.000947 0.000947 0.000947 0.000947 0.000947 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"122 0.000133 0.000133 0.000133 0.998667 \n",
"2996 0.000133 0.000133 0.000133 0.998667 \n",
"4117 0.000133 0.000133 0.000133 0.998667 \n",
"2682 0.000805 0.000805 0.000805 0.991954 \n",
"2191 0.000850 0.000850 0.000850 0.991503 \n",
"1129 0.000858 0.000858 0.000858 0.991423 \n",
"3720 0.000928 0.000928 0.000928 0.990723 \n",
"3739 0.000937 0.000937 0.000937 0.990628 \n",
"935 0.000937 0.000937 0.000937 0.990628 \n",
"1335 0.000947 0.000947 0.000947 0.990530 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_photutils.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_astroquery.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/astropy_ccdproc.git_hullabaloo_README.rst\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/nxt-firmware.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/seattlerb_rubyinline.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/gazay_gon_hullabaloo_README.md\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/seattlerb_ruby_parser.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/faye_faye_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jtzemp_base62.git_hullabaloo_README.rdoc\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/seattlerb_sexp_processor_hullabaloo_README.txt\n",
2025-02-02 21:42:09 +00:00
" 0 1 2 3 4 5 6 \\\n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.583029 0.061516 0.331986 0.003909 0.019454 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 0.999813 0.000019 0.000019 \n",
"3560 0.420193 0.034504 0.322584 0.020994 0.002791 0.000024 0.000024 \n",
"1604 0.003947 0.126398 0.000025 0.039761 0.000025 0.013216 0.735422 \n",
"1106 0.000030 0.029005 0.708216 0.021411 0.000030 0.000030 0.166185 \n",
"3275 0.007322 0.141429 0.000032 0.000032 0.297121 0.451645 0.000032 \n",
"2154 0.000033 0.000033 0.549309 0.450392 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.346634 0.000033 0.000033 0.000033 0.000033 \n",
"1435 0.000038 0.158357 0.000038 0.082379 0.000038 0.685618 0.000038 \n",
"2081 0.000039 0.278867 0.041405 0.000039 0.663507 0.000039 0.015947 \n",
2025-02-02 21:42:09 +00:00
"\n",
" 7 8 9 10 \n",
2025-02-04 01:06:13 +00:00
"1319 0.000018 0.000018 0.000018 0.000018 \n",
"3299 0.000019 0.000019 0.000019 0.000019 \n",
"3560 0.015793 0.104444 0.078624 0.000024 \n",
"1604 0.005943 0.000025 0.075213 0.000025 \n",
"1106 0.028543 0.000030 0.046489 0.000030 \n",
"3275 0.000032 0.000032 0.102291 0.000032 \n",
"2154 0.000033 0.000033 0.000033 0.000033 \n",
"3771 0.000033 0.000033 0.653065 0.000033 \n",
"1435 0.005478 0.006674 0.061303 0.000038 \n",
"2081 0.000039 0.000039 0.000039 0.000039 \n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/andrikos_kismet-debian_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/suds-community_suds.git_hullabaloo_README.rst\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/Xastir_Xastir.git_hullabaloo_README.1ST\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/darold_ora2pg.git_hullabaloo_README\n",
2025-02-02 21:42:09 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/smbolton_whysynth.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/enova_pg_fact_loader.git_hullabaloo_README.md\n",
2025-02-04 01:06:13 +00:00
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/jazzband_inflect.git_hullabaloo_README.txt\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/iortcw_iortcw.git_hullabaloo_README\n",
"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/epam_nfstrace_hullabaloo_README\n",
"0 0.097433\n",
"1 0.101780\n",
"2 0.081539\n",
"3 0.130848\n",
"4 0.024088\n",
"5 0.061218\n",
"6 0.089808\n",
"7 0.134225\n",
"8 0.059773\n",
"9 0.132976\n",
"10 0.086312\n",
2025-02-02 21:42:09 +00:00
"dtype: float64\n"
]
}
],
"source": [
"prevalent_topics(data_vectorized, file_list)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}