{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262", "metadata": {}, "outputs": [], "source": [ "import re\n", "import numpy as np\n", "import pandas as pd\n", "import glob\n", "import copy\n", "import csv\n", "from statistics import mean, median\n", "from strip_markdown import strip_markdown\n", "import joblib" ] }, { "cell_type": "code", "execution_count": 2, "id": "9483091c-ac72-415c-932d-ac7cf7970789", "metadata": {}, "outputs": [], "source": [ "import gensim\n", "import gensim.corpora as corpora\n", "from gensim.utils import simple_preprocess\n", "from gensim.models import CoherenceModel\n", "from gensim.models.phrases import Phrases\n", "\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "\n", "from statistics import mode\n", "\n", "from collections import defaultdict" ] }, { "cell_type": "code", "execution_count": 3, "id": "196abd6a", "metadata": {}, "outputs": [], "source": [ "#import nltk\n", "#nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": 4, "id": "3da6b590-875d-478d-aaaa-de020039c519", "metadata": {}, "outputs": [], "source": [ "# spacy and nltk for lemmatization\n", "import nltk \n", "#nltk.download('stopwords')\n", "import spacy\n", "from nltk.corpus import stopwords\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "\n", "stopwords = stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": 15, "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b", "metadata": {}, "outputs": [], "source": [ "def metadata_for_file(file):\n", " word_list = file.split()\n", " word_count = len(word_list)\n", " #print(word_list)\n", " if word_count == 0:\n", " avg_word_length = 0\n", " else: \n", " avg_word_length = sum(map(len, word_list)) / len(word_list)\n", " #return number of paragraphs\n", " return word_count, avg_word_length, word_list" ] }, { "cell_type": "code", "execution_count": 17, "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3", "metadata": {}, "outputs": [], "source": [ "def get_data_from_dir(directory):\n", " files = glob.glob(f\"{directory}/*\")\n", " data_list = []\n", " word_counts = []\n", " avg_word_lengths = []\n", " file_list = []\n", " files_word_lists = defaultdict(list)\n", " for file in files:\n", " text = open(file, encoding='utf-8', errors='ignore').read()\n", " #here's some of the descriptive text analysis\n", " word_count, avg_word_length, word_list = metadata_for_file(text)\n", " word_counts.append(word_count)\n", " avg_word_lengths.append(avg_word_length)\n", " #adding the data to the list of text\n", " if word_count > 0:\n", " files_word_lists[tuple(word_list)].append(file)\n", " data_list.append(text)\n", " #adding filename\n", " file_list.append(file)\n", " return data_list, word_counts, avg_word_lengths, file_list, " ] }, { "cell_type": "code", "execution_count": 7, "id": "2b332b10-bfc8-4566-8c52-19a8a334af00", "metadata": {}, "outputs": [], "source": [ "#preprocessing text data\n", "def preprocess(corpus_list):\n", " #extending stopwords \n", " specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n", " \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n", " stopwords.extend(specific_stopwords)\n", " D = copy.copy(corpus_list)\n", " #stripping markdown from documents\n", " D = [strip_markdown(doc) for doc in D]\n", " #strip html \n", " D = [re.sub(r'<[^<]+?>', '', doc, flags=re.DOTALL) for doc in D]\n", " #mvp right now, can certainly be expanded as iterations of text analysis are done\n", " D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]\n", " lemmatizer = WordNetLemmatizer()\n", " D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n", " return D_lemma" ] }, { "cell_type": "code", "execution_count": 8, "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7", "metadata": {}, "outputs": [], "source": [ "#preparing processed data for model usage\n", "def text_preparation(lemmatized_text):\n", " #bigrams\n", " D_bigrams = copy.copy(lemmatized_text)\n", " bigram = Phrases(D_bigrams, min_count=2)\n", " for i in range(len(lemmatized_text)):\n", " for token in bigram[D_bigrams[i]]:\n", " if '_' in token:\n", " D_bigrams[i].append(token)\n", " #id2word\n", " id2word = corpora.Dictionary(D_bigrams)\n", " id2word.filter_extremes(no_below=5, no_above=0.5)\n", " #bow representation \n", " bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n", " return bag_of_words, id2word" ] }, { "cell_type": "code", "execution_count": 9, "id": "24799e25-2c0c-4e16-b503-68296f604f52", "metadata": {}, "outputs": [], "source": [ "def lda_model_identification(data_vectorized):\n", " lda = LatentDirichletAllocation()\n", " search_params = {'n_components': [9], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }\n", " model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n", " model.fit(data_vectorized)\n", " best_lda_model = model.best_estimator_\n", " print(\"Best Model's Params: \", model.best_params_)\n", " print(\"Best Log Likelihood Score: \", model.best_score_)\n", " print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))" ] }, { "cell_type": "code", "execution_count": 22, "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5", "metadata": {}, "outputs": [], "source": [ "def best_lda_model(data_vectorized, vocab):\n", " lda = LatentDirichletAllocation(n_components=9, learning_decay = 0.7, batch_size = 128, max_iter = 50)\n", " id_topic = lda.fit_transform(data_vectorized)\n", " topic_words = {}\n", " for topic, comp in enumerate(lda.components_):\n", " word_idx = np.argsort(comp)[::-1][:10]\n", " topic_words[topic] = [vocab[i] for i in word_idx]\n", " for topic, words in topic_words.items():\n", " print('Topic: %d' % topic)\n", " print(' %s' % ', '.join(words))\n", " #lda.print_topics(num_words=10)\n", " joblib.dump(lda, '020325_README_lda.jl')\n", " #lda = joblib.load('0509_lda.jl')\n", " return id_topic" ] }, { "cell_type": "code", "execution_count": 31, "id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27", "metadata": {}, "outputs": [], "source": [ "def get_most_prevalent(vect_documents, documents):\n", " lda = joblib.load('020725_README_lda.jl')\n", " distributions = lda.transform(vect_documents)\n", " most_prevalent = {0: [0, \"\"],1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"], 8: [0, \"\"]}\n", " for i, topic_distribution in enumerate(distributions):\n", " for j in range(9):\n", " if topic_distribution[j] > most_prevalent[j][0]:\n", " most_prevalent[j] = [topic_distribution[j], documents[i]]\n", " print(most_prevalent)\n", " return most_prevalent\n" ] }, { "cell_type": "code", "execution_count": 35, "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec", "metadata": {}, "outputs": [], "source": [ "def prevalent_topics(vect_documents, file_list):\n", " lda = joblib.load('020725_README_lda.jl')\n", " #lda = joblib.load('0514_contrib_lda.jl')\n", " distributions = lda.transform(vect_documents)\n", " #figuring out what the max distribution is and then figuring out the mode\n", " top_topic = []\n", " count_of_multiple = 0\n", " topic_arrays = []\n", " for i, topic_distribution in enumerate(distributions):\n", " max_dist = max(topic_distribution)\n", " indexes = np.where(topic_distribution == max_dist)[0]\n", " if len(indexes) == 1:\n", " top_topic.append(indexes[0])\n", " else:\n", " count_of_multiple += 1\n", " topic_arrays.append(topic_distribution)\n", " #most_frequent(top_topic)\n", " print(count_of_multiple)\n", " df = pd.DataFrame(topic_arrays)\n", " #finding the distribution values for all documents\n", " with open('020725_README_file_topic_distributions.csv', 'w', newline='') as csvfile:\n", " fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8']\n", " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", " writer.writeheader()\n", " for i, row in df.iterrows():\n", " project_dir = {}\n", " project_dir['filename'] = file_list[i].split(\"/\")[-1]\n", " array_row = df.iloc[i].to_numpy()\n", " for j in range(9):\n", " project_dir[\"t\" + str(j)] = array_row[j]\n", " writer.writerow(project_dir)\n", " #print(df.sort_values(by=['0']).head(5))\n", " for i in range(9):\n", " print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n", " top5 = df.nlargest(10, i)\n", " top_indices = top5.index.to_list()\n", " print(top5)\n", " for index in top_indices:\n", " print(file_list[index])\n", " bottom5 = df.nsmallest(10, i)\n", " bottom_indices = bottom5.index.to_list()\n", " print(bottom5)\n", " for index in bottom_indices:\n", " print(file_list[index])\n", " averages = df.mean()\n", " print(averages)\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3", "metadata": {}, "outputs": [], "source": [ "def most_frequent(topic_prevalence):\n", " most_frequent_array = []\n", " for j in range(11):\n", " topic = mode(topic_prevalence)\n", " most_frequent_array.append(topic)\n", " topic_prevalence = [i for i in topic_prevalence if i != topic]\n", " print(most_frequent_array)" ] }, { "cell_type": "code", "execution_count": 13, "id": "69d606fd", "metadata": {}, "outputs": [], "source": [ "readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/\"" ] }, { "cell_type": "code", "execution_count": 18, "id": "1f937c2e-2714-475d-b670-602164c46642", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean wordcount: 324.0929957406531\n", "Median wordcount: 156.0\n", "Mean wordlength: 6.354120246310486\n", "Median wordlength: 5.950514528900827\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/SOC.NORTHWESTERN.EDU/nws8519/anaconda3/lib/python3.12/html/parser.py:171: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n", " k = self.parse_starttag(i)\n" ] } ], "source": [ "listed_corpus, wordcounts, wordlengths, file_list= get_data_from_dir(readme_directory)\n", "print(\"Mean wordcount: \", mean(wordcounts))\n", "print(\"Median wordcount: \", median(wordcounts))\n", "print(\"Mean wordlength: \", mean(wordlengths))\n", "print(\"Median wordlength: \", median(wordlengths))\n", "lemmatized_corpus = preprocess(listed_corpus)" ] }, { "cell_type": "code", "execution_count": null, "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['020725_README_vectorizer.joblib']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "'''\n", "vectorizer = CountVectorizer(analyzer='word', \n", " min_df=2, \n", " stop_words='english', \n", " lowercase=True, \n", " token_pattern='[a-zA-Z0-9]{2,}', \n", " )\n", "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n", "joblib.dump(vectorizer, '020725_README_vectorizer.joblib')\n", "'''\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "d68aaf7b", "metadata": {}, "outputs": [], "source": [ "vectorizer = joblib.load('020725_README_vectorizer.joblib')\n", "data_vectorized = vectorizer.transform(lemmatized_corpus) " ] }, { "cell_type": "code", "execution_count": 21, "id": "dd1a70c2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n", "[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n", "[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1158862.039 total time= 17.6s\n", "[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n", "[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1121276.805 total time= 12.0s\n", "[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n", "[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1058330.478 total time= 12.6s\n", "[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n", "[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1169073.807 total time= 12.7s\n", "[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n", "[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1308701.275 total time= 11.9s\n", "[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n", "[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1157991.152 total time= 11.8s\n", "[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n", "[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1120570.803 total time= 11.7s\n", "[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n", "[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1055699.316 total time= 12.4s\n", "[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n", "[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1168297.207 total time= 11.7s\n", "[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n", "[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1307949.520 total time= 12.4s\n", "[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n", "[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1157830.351 total time= 11.7s\n", "[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n", "[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1124221.589 total time= 11.8s\n", "[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n", "[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1056916.516 total time= 12.0s\n", "[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n", "[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1169168.331 total time= 12.9s\n", "[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n", "[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1308175.234 total time= 12.5s\n", "[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n", "[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1158540.475 total time= 12.2s\n", "[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n", "[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1120071.919 total time= 11.8s\n", "[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n", "[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1061777.082 total time= 12.3s\n", "[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n", "[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1170380.631 total time= 11.6s\n", "[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n", "[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1307034.410 total time= 11.6s\n", "[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n", "[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1156265.357 total time= 13.2s\n", "[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n", "[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1121786.140 total time= 11.8s\n", "[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n", "[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1060299.402 total time= 12.5s\n", "[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n", "[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1174913.458 total time= 12.1s\n", "[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n", "[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1305421.859 total time= 11.6s\n", "[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n", "[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1158280.857 total time= 12.0s\n", "[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n", "[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1117369.387 total time= 11.5s\n", "[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n", "[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1058230.568 total time= 12.0s\n", "[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n", "[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1171980.166 total time= 11.9s\n", "[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n", "[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1305822.528 total time= 11.3s\n", "Best Model's Params: {'batch_size': 128, 'learning_decay': 0.7, 'n_components': 9}\n", "Best Log Likelihood Score: -1162101.5996668166\n", "Model Perplexity: 2176.5064559983784\n" ] } ], "source": [ "lda_model_identification(data_vectorized)" ] }, { "cell_type": "code", "execution_count": 23, "id": "aa83d20f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Topic: 0\n", " image, data, key, file, color, option, support, format, default, mode\n", "Topic: 1\n", " data, test, library, object, implementation, support, packet, used, byte, class\n", "Topic: 2\n", " license, copyright, perl, gnu, free, version, module, public, general, warranty\n", "Topic: 3\n", " test, value, function, return, method, class, string, type, object, example\n", "Topic: 4\n", " http, git, server, install, client, request, test, version, project, command\n", "Topic: 5\n", " json, node, require, string, parser, var, object, parse, function, font\n", "Topic: 6\n", " command, output, option, process, make, program, script, tool, file, linux\n", "Topic: 7\n", " table, html, tag, text, django, xml, example, path, template, default\n", "Topic: 8\n", " install, make, build, library, version, directory, file, package, window, project\n" ] } ], "source": [ "topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())" ] }, { "cell_type": "code", "execution_count": 33, "id": "f4345bd6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0: [0.9963399069190733, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'], 1: [0.9987558745140913, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'], 2: [0.999271074201955, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'], 3: [0.9966940236237574, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'], 4: [0.9962628678061417, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'], 5: [0.998166117886522, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'], 6: [0.9670683884278027, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'], 7: [0.9996764637160757, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'], 8: [0.9976094391943828, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}\n" ] }, { "data": { "text/plain": [ "{0: [0.9963399069190733,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'],\n", " 1: [0.9987558745140913,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'],\n", " 2: [0.999271074201955,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'],\n", " 3: [0.9966940236237574,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'],\n", " 4: [0.9962628678061417,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'],\n", " 5: [0.998166117886522,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'],\n", " 6: [0.9670683884278027,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'],\n", " 7: [0.9996764637160757,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'],\n", " 8: [0.9976094391943828,\n", " '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_most_prevalent(data_vectorized, file_list)" ] }, { "cell_type": "code", "execution_count": 36, "id": "23468e82", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "183\n", "-----------------------Topic 0 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "3142 0.996340 0.000457 0.000458 0.000458 0.000458 0.000457 0.000457 \n", "810 0.995085 0.000614 0.000614 0.000614 0.000614 0.000614 0.000614 \n", "3064 0.983533 0.002058 0.002058 0.002059 0.002059 0.002058 0.002059 \n", "2980 0.960597 0.000512 0.035817 0.000512 0.000513 0.000512 0.000512 \n", "197 0.892184 0.000950 0.000951 0.000950 0.000950 0.101164 0.000950 \n", "131 0.867562 0.001765 0.001764 0.036755 0.001764 0.001765 0.001765 \n", "3694 0.864345 0.001390 0.001390 0.001389 0.001390 0.049473 0.001390 \n", "582 0.857786 0.000105 0.064223 0.043669 0.000105 0.000105 0.000105 \n", "3026 0.851801 0.018529 0.018519 0.018536 0.018522 0.018532 0.018522 \n", "1647 0.851778 0.018530 0.018519 0.018530 0.018528 0.018532 0.018540 \n", "\n", " 7 8 \n", "3142 0.000458 0.000458 \n", "810 0.000614 0.000615 \n", "3064 0.002059 0.002058 \n", "2980 0.000512 0.000512 \n", "197 0.000950 0.000950 \n", "131 0.085096 0.001765 \n", "3694 0.001389 0.077843 \n", "582 0.033796 0.000105 \n", "3026 0.018521 0.018519 \n", "1647 0.018521 0.018523 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2362_plasma_breeze-plymouth.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1356_katomic.git_README.levels\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/541_dunst-project_dunst_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2497_borntyping_python-colorlog.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2421_whipper-team_whipper.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3297_takaswie_hinawa-utils.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/697_tiwai_awesfx.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2831_kilobyte_pmemkv_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2095_warner_magic-wormhole.git_README.md\n", " 0 1 2 3 4 5 6 \\\n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "37 0.000045 0.164319 0.053585 0.504039 0.040306 0.006119 0.171102 \n", "2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "3287 0.000070 0.000070 0.007213 0.190779 0.000070 0.028435 0.027117 \n", "455 0.000071 0.481662 0.000071 0.030261 0.000071 0.000071 0.336087 \n", "1313 0.000076 0.047031 0.000076 0.097269 0.012519 0.034803 0.056043 \n", "1598 0.000079 0.000079 0.000079 0.542049 0.000079 0.000079 0.000079 \n", "\n", " 7 8 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "37 0.005875 0.054610 \n", "2913 0.000048 0.000048 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "3287 0.746179 0.000070 \n", "455 0.000071 0.151635 \n", "1313 0.104692 0.647491 \n", "1598 0.000079 0.457399 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/722_zeromq_czmq.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2774_timlegge_perl-XML-Generator_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3509_jirka-h_haveged_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/511_mmottl_ocaml-makefile.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4029_yayahjb_cqrlib.git_README_CQRlib.html\n", "-----------------------Topic 1 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "328 0.000156 0.998756 0.000155 0.000156 0.000156 0.000156 0.000156 \n", "2081 0.000601 0.995192 0.000601 0.000601 0.000601 0.000601 0.000601 \n", "2672 0.000862 0.993105 0.000862 0.000862 0.000862 0.000862 0.000862 \n", "389 0.004276 0.965797 0.004276 0.004275 0.004274 0.004275 0.004277 \n", "3087 0.005558 0.955534 0.005557 0.005556 0.005560 0.005558 0.005559 \n", "2478 0.006538 0.947694 0.006537 0.006539 0.006538 0.006538 0.006538 \n", "2272 0.006955 0.944406 0.006946 0.006946 0.006950 0.006947 0.006950 \n", "853 0.000350 0.932979 0.000350 0.000350 0.000350 0.000350 0.000350 \n", "1081 0.000567 0.931828 0.000567 0.064202 0.000567 0.000567 0.000567 \n", "1443 0.008555 0.931588 0.008548 0.008556 0.008549 0.008553 0.008550 \n", "\n", " 7 8 \n", "328 0.000156 0.000155 \n", "2081 0.000601 0.000601 \n", "2672 0.000862 0.000862 \n", "389 0.004275 0.004275 \n", "3087 0.005558 0.005560 \n", "2478 0.006537 0.006539 \n", "2272 0.006951 0.006949 \n", "853 0.000350 0.064574 \n", "1081 0.000567 0.000567 \n", "1443 0.008550 0.008551 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2517_cleder_fastkml_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2967_mila-iqia_picklable-itertools_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2575_samtools_htsjdk.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/543_boto_s3transfer_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2308_php-fig_cache.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2772_storaged-project_libblockdev_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/304_pauldmccarthy_indexed_gzip.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1282_python-lz4_python-lz4.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2289_carlodefalco_octave-mpi_README.md\n", " 0 1 2 3 4 5 6 \\\n", "3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "3072 0.582368 0.000065 0.019226 0.039125 0.000065 0.041239 0.209392 \n", "661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n", "3221 0.221286 0.000069 0.040534 0.620289 0.000069 0.000069 0.000069 \n", "\n", " 7 8 \n", "3795 0.131447 0.000015 \n", "3966 0.000035 0.295755 \n", "3027 0.717508 0.000039 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "964 0.000043 0.789164 \n", "2865 0.000047 0.452085 \n", "3072 0.069255 0.039265 \n", "661 0.000066 0.000066 \n", "3221 0.117548 0.000069 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2056_OpenPrinting_foomatic-db.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4206_dnmfarrell_Data-FormValidator.git_README\n", "-----------------------Topic 2 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "672 0.000091 0.000091 0.999271 0.000091 0.000091 0.000091 0.000091 \n", "3181 0.000152 0.000152 0.998782 0.000152 0.000152 0.000152 0.000152 \n", "3929 0.000966 0.000967 0.992265 0.000967 0.000967 0.000967 0.000967 \n", "1475 0.001039 0.001039 0.991690 0.001039 0.001039 0.001039 0.001039 \n", "1309 0.001059 0.001059 0.991530 0.001059 0.001059 0.001059 0.001059 \n", "4022 0.001079 0.001079 0.991367 0.001079 0.001079 0.001079 0.001079 \n", "3212 0.001112 0.001112 0.991106 0.001112 0.001112 0.001112 0.001112 \n", "2521 0.001123 0.001124 0.991016 0.001123 0.001123 0.001123 0.001123 \n", "2024 0.001134 0.001134 0.990927 0.001134 0.001135 0.001134 0.001134 \n", "2147 0.001146 0.001146 0.990834 0.001146 0.001146 0.001146 0.001146 \n", "\n", " 7 8 \n", "672 0.000091 0.000091 \n", "3181 0.000152 0.000152 \n", "3929 0.000967 0.000967 \n", "1475 0.001039 0.001039 \n", "1309 0.001059 0.001059 \n", "4022 0.001079 0.001079 \n", "3212 0.001112 0.001112 \n", "2521 0.001123 0.001123 \n", "2024 0.001134 0.001134 \n", "2147 0.001146 0.001146 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/666_RobertBeckebans_RBDOOM-3-BFG_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1439_pdfminer_pdfminer.six.git_README.html\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1191_backuppc_backuppc-xs.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3536_jkeenan_extutils-modulemaker.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/898_adrianlopezroche_fdupes.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/767_knik0_faad2.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3331_ClusterLabs_fence-agents_README.licence\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/139_seattlerb_ruby_parser.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2854_faye_faye_README.txt\n", " 0 1 2 3 4 5 6 \\\n", "3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n", "1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "10 0.000851 0.259910 0.000064 0.008255 0.000064 0.000845 0.723237 \n", "\n", " 7 8 \n", "3795 0.131447 0.000015 \n", "3966 0.000035 0.295755 \n", "3648 0.000038 0.156749 \n", "3251 0.999676 0.000040 \n", "2865 0.000047 0.452085 \n", "2913 0.000048 0.000048 \n", "1537 0.000049 0.391008 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "10 0.006711 0.000064 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n", "-----------------------Topic 3 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "4184 0.000413 0.000413 0.000413 0.996694 0.000413 0.000413 0.000413 \n", "634 0.000585 0.000585 0.000585 0.995319 0.000585 0.000585 0.000585 \n", "541 0.001059 0.001059 0.001059 0.991531 0.001058 0.001059 0.001059 \n", "4086 0.001069 0.001069 0.001069 0.991448 0.001069 0.001069 0.001069 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "1082 0.000108 0.000108 0.038632 0.960609 0.000108 0.000108 0.000108 \n", "3436 0.000214 0.000214 0.000214 0.953378 0.000214 0.045126 0.000214 \n", "3099 0.000483 0.000483 0.000483 0.945309 0.030232 0.000483 0.000483 \n", "168 0.000204 0.000204 0.016534 0.944392 0.037848 0.000204 0.000204 \n", "805 0.000420 0.000419 0.000420 0.939900 0.057163 0.000420 0.000419 \n", "\n", " 7 8 \n", "4184 0.000413 0.000413 \n", "634 0.000585 0.000585 \n", "541 0.001058 0.001059 \n", "4086 0.001069 0.001069 \n", "337 0.000043 0.000043 \n", "1082 0.000108 0.000108 \n", "3436 0.000214 0.000214 \n", "3099 0.000483 0.021559 \n", "168 0.000204 0.000204 \n", "805 0.000419 0.000420 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3251_eproxus_meck.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1044_wolever_parameterized_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2204_easystats_parameters.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4060_perl5-utils_Params-Util_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/629_hamcrest_PyHamcrest_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4064_testing-cabal_mock_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3817_webmozart_assert_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1432_fluxx_exam_README.rst\n", " 0 1 2 3 4 5 6 \\\n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3355 0.044710 0.035340 0.279984 0.000040 0.000040 0.019913 0.122916 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "3870 0.312524 0.017558 0.065654 0.000051 0.032472 0.000051 0.098971 \n", "3366 0.196719 0.041714 0.043868 0.000061 0.309832 0.000061 0.306935 \n", "1239 0.133374 0.866158 0.000067 0.000067 0.000067 0.000067 0.000067 \n", "2205 0.306048 0.030212 0.382652 0.000073 0.018162 0.071704 0.032824 \n", "1183 0.281059 0.000077 0.000077 0.000077 0.000077 0.007434 0.000077 \n", "\n", " 7 8 \n", "3966 0.000035 0.295755 \n", "3355 0.333160 0.163897 \n", "3251 0.999676 0.000040 \n", "964 0.000043 0.789164 \n", "2865 0.000047 0.452085 \n", "3870 0.455920 0.016799 \n", "3366 0.000061 0.100750 \n", "1239 0.000067 0.000067 \n", "2205 0.045686 0.112640 \n", "1183 0.000077 0.711047 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1611_arno-iptables-firewall_aif.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2225_JamesHeinrich_getID3_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/362_mate-desktop_marco.git_README\n", "-----------------------Topic 4 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "331 0.000467 0.000467 0.000467 0.000467 0.996263 0.000467 0.000467 \n", "3569 0.000601 0.000601 0.000601 0.000601 0.995193 0.000601 0.000601 \n", "2222 0.000717 0.000717 0.000717 0.000717 0.994262 0.000717 0.000717 \n", "3364 0.000761 0.000761 0.000762 0.000762 0.993908 0.000761 0.000761 \n", "659 0.000806 0.000805 0.000805 0.000807 0.993555 0.000806 0.000805 \n", "4125 0.000849 0.000849 0.000848 0.000849 0.993211 0.000849 0.000849 \n", "3191 0.000975 0.000976 0.000975 0.000975 0.992198 0.000975 0.000975 \n", "1342 0.000992 0.000992 0.000993 0.000992 0.992061 0.000992 0.000992 \n", "3762 0.001090 0.001090 0.001089 0.001089 0.991284 0.001089 0.001089 \n", "1350 0.001135 0.001135 0.001134 0.001135 0.990924 0.001135 0.001135 \n", "\n", " 7 8 \n", "331 0.000467 0.000467 \n", "3569 0.000601 0.000601 \n", "2222 0.000717 0.000717 \n", "3364 0.000762 0.000761 \n", "659 0.000806 0.000806 \n", "4125 0.000849 0.000848 \n", "3191 0.000975 0.000975 \n", "1342 0.000992 0.000992 \n", "3762 0.001089 0.001090 \n", "1350 0.001134 0.001135 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3145_tkem_mopidy-dleyna.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2600_spyder-ide_qtpy_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/953_rroemhild_flask-ldapconn_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3185_mopidy_mopidy-scrobbler.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2310_tduehr_omniauth-cas3_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2822_gawel_panoramisk.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3648_tkem_mopidy-podcast-itunes_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3969_kingosticks_mopidy-tunein_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3103_erdewit_nest_asyncio_README.rst\n", " 0 1 2 3 4 5 6 \\\n", "3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n", "3355 0.044710 0.035340 0.279984 0.000040 0.000040 0.019913 0.122916 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "\n", " 7 8 \n", "3795 0.131447 0.000015 \n", "3966 0.000035 0.295755 \n", "3027 0.717508 0.000039 \n", "3355 0.333160 0.163897 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "964 0.000043 0.789164 \n", "2865 0.000047 0.452085 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "-----------------------Topic 5 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "3370 0.000229 0.000229 0.000229 0.000229 0.000229 0.998166 0.000229 \n", "1017 0.000333 0.000333 0.000333 0.000333 0.000333 0.997338 0.000333 \n", "3174 0.000423 0.000423 0.000423 0.000423 0.000423 0.996619 0.000423 \n", "717 0.001710 0.001711 0.001710 0.001710 0.001711 0.986318 0.001710 \n", "4132 0.002268 0.002269 0.002268 0.002269 0.002269 0.981849 0.002269 \n", "3 0.002778 0.002779 0.002779 0.002780 0.002779 0.977767 0.002779 \n", "1555 0.002925 0.002925 0.002924 0.002925 0.002925 0.976601 0.002926 \n", "1336 0.003969 0.003970 0.003969 0.003972 0.003970 0.968244 0.003969 \n", "705 0.005565 0.005560 0.005561 0.005569 0.005559 0.955511 0.005559 \n", "2220 0.006536 0.006537 0.006541 0.006540 0.006537 0.947699 0.006536 \n", "\n", " 7 8 \n", "3370 0.000229 0.000229 \n", "1017 0.000333 0.000333 \n", "3174 0.000423 0.000423 \n", "717 0.001710 0.001710 \n", "4132 0.002269 0.002269 \n", "3 0.002779 0.002780 \n", "1555 0.002925 0.002925 \n", "1336 0.003969 0.003969 \n", "705 0.005558 0.005557 \n", "2220 0.006537 0.006536 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2594_crsmithdev_arrow.git_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2879_jonschlinkert_map-visit_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1625_keis_base58_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/162_sebastianbergmann_comparator_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1075_sebastianbergmann_object-enumerator_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/830_npm_abbrev-js_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1722_unclechu_node-deep-extend.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2412_crosswire-bible-society_nave_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4251_hughsk_is-typedarray_README.md\n", " 0 1 2 3 4 5 6 \\\n", "3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n", "3870 0.312524 0.017558 0.065654 0.000051 0.032472 0.000051 0.098971 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "\n", " 7 8 \n", "3795 0.131447 0.000015 \n", "3966 0.000035 0.295755 \n", "3648 0.000038 0.156749 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "2865 0.000047 0.452085 \n", "1537 0.000049 0.391008 \n", "3870 0.455920 0.016799 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "-----------------------Topic 6 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "1893 0.004116 0.004116 0.004116 0.004116 0.004119 0.004116 0.967068 \n", "2605 0.000456 0.000456 0.000456 0.000456 0.000456 0.031675 0.965135 \n", "2308 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n", "2446 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n", "4152 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n", "371 0.005053 0.005053 0.005053 0.005051 0.005053 0.005052 0.959573 \n", "1589 0.006946 0.006946 0.006947 0.006945 0.006947 0.006945 0.944423 \n", "3111 0.006950 0.006946 0.006947 0.006950 0.006950 0.006950 0.944408 \n", "3995 0.056876 0.000100 0.000100 0.000100 0.000100 0.000100 0.942423 \n", "3135 0.007414 0.007415 0.007409 0.007414 0.007411 0.007411 0.940706 \n", "\n", " 7 8 \n", "1893 0.004116 0.004117 \n", "2605 0.000456 0.000456 \n", "2308 0.004630 0.004631 \n", "2446 0.004630 0.004631 \n", "4152 0.004630 0.004631 \n", "371 0.005052 0.005059 \n", "1589 0.006949 0.006953 \n", "3111 0.006952 0.006948 \n", "3995 0.000100 0.000100 \n", "3135 0.007410 0.007411 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1607_ppentchev_feature-check_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2577_kodi-pvr_pvr.dvbviewer.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2363_kodi-pvr_pvr.hdhomerun.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3414_kodi-pvr_pvr.njoy.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3527_hackerschoice_THC-Archive_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2455_tcolar_wmfrog_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2103_aperezdc_signify.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/119_linux-thinkpad_tp_smapi_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3587_igaw_jitterdebugger.git_README\n", " 0 1 2 3 4 5 6 \\\n", "3597 0.520753 0.030707 0.076996 0.009554 0.064448 0.004423 0.000021 \n", "3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n", "3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "4047 0.672539 0.165141 0.000064 0.028149 0.043231 0.000064 0.000064 \n", "661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n", "1239 0.133374 0.866158 0.000067 0.000067 0.000067 0.000067 0.000067 \n", "\n", " 7 8 \n", "3597 0.000021 0.293076 \n", "3648 0.000038 0.156749 \n", "3027 0.717508 0.000039 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "4047 0.090683 0.000064 \n", "661 0.000066 0.000066 \n", "1239 0.000067 0.000067 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n", "-----------------------Topic 7 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "2445 0.000363 0.000363 0.000363 0.000363 0.000363 0.000363 0.000363 \n", "1971 0.001001 0.001002 0.001001 0.001002 0.001001 0.001002 0.001001 \n", "2288 0.001635 0.001635 0.001634 0.001635 0.001635 0.001636 0.001635 \n", "52 0.002138 0.002139 0.002138 0.002140 0.002138 0.002138 0.002138 \n", "2276 0.023188 0.000155 0.000155 0.000155 0.000155 0.000155 0.000155 \n", "1682 0.000772 0.000772 0.000772 0.000772 0.039993 0.000772 0.000772 \n", "3082 0.009264 0.009261 0.009262 0.009265 0.009266 0.009262 0.009269 \n", "1117 0.084798 0.000643 0.000642 0.000643 0.000643 0.000642 0.000643 \n", "2480 0.000185 0.000185 0.000185 0.000185 0.000185 0.000185 0.000185 \n", "\n", " 7 8 \n", "3251 0.999676 0.000040 \n", "2445 0.997094 0.000363 \n", "1971 0.991989 0.001001 \n", "2288 0.986920 0.001635 \n", "52 0.982894 0.002138 \n", "2276 0.975728 0.000155 \n", "1682 0.954604 0.000772 \n", "3082 0.925882 0.009269 \n", "1117 0.910704 0.000642 \n", "2480 0.909884 0.088824 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3104_scrapy-plugins_scrapy-djangoitem_README.rst\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1445_carljm_django-model-utils.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/864_bfirsh_django-ordered-model.git_README.markdown\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3002_ionelmc_python-darkslide.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4196_alexott_muse_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/995_coleifer_wtf-peewee_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/811_sopel-irc_sopel.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1019_jazzband_django-sortedm2m.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/356_wanderlust_apel_README.en\n", " 0 1 2 3 4 5 6 \\\n", "3597 0.520753 0.030707 0.076996 0.009554 0.064448 0.004423 0.000021 \n", "3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n", "3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n", "2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n", "2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n", "1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "\n", " 7 8 \n", "3597 0.000021 0.293076 \n", "3966 0.000035 0.295755 \n", "3648 0.000038 0.156749 \n", "337 0.000043 0.000043 \n", "964 0.000043 0.789164 \n", "2865 0.000047 0.452085 \n", "2913 0.000048 0.000048 \n", "1537 0.000049 0.391008 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "-----------------------Topic 8 --------------------------------\n", " 0 1 2 3 4 5 6 \\\n", "140 0.000299 0.000299 0.000299 0.000299 0.000299 0.000299 0.000299 \n", "444 0.000475 0.000475 0.000475 0.000475 0.000475 0.000475 0.000475 \n", "124 0.000517 0.000517 0.000517 0.000517 0.000517 0.000517 0.000517 \n", "2319 0.000559 0.000559 0.000559 0.000559 0.000559 0.000559 0.000559 \n", "3911 0.000564 0.000564 0.000564 0.000564 0.000564 0.000564 0.000564 \n", "2132 0.000621 0.000621 0.000621 0.000621 0.000621 0.000621 0.000621 \n", "75 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 \n", "529 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 \n", "2202 0.000642 0.000642 0.000642 0.000643 0.000642 0.000643 0.000643 \n", "3084 0.000642 0.000642 0.000642 0.000643 0.000642 0.000643 0.000643 \n", "\n", " 7 8 \n", "140 0.000299 0.997609 \n", "444 0.000475 0.996200 \n", "124 0.000517 0.995864 \n", "2319 0.000559 0.995531 \n", "3911 0.000565 0.995485 \n", "2132 0.000621 0.995032 \n", "75 0.000635 0.994919 \n", "529 0.000635 0.994919 \n", "2202 0.000643 0.994860 \n", "3084 0.000643 0.994860 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3202_extensions_dune-grid-glue_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2977_airspy_airspyone_host.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2136_wxMaxima-developers_wxmaxima_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2068_math-comp_math-comp_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2814_MaartenBaert_ssr_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2954_core_dune-localfunctions_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1140_core_dune-geometry_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/305_staging_dune-typetree_README.GIT\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4205_staging_dune-functions_README\n", " 0 1 2 3 4 5 6 \\\n", "3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n", "3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n", "3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n", "337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n", "2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n", "1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n", "802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n", "10 0.000851 0.259910 0.000064 0.008255 0.000064 0.000845 0.723237 \n", "4047 0.672539 0.165141 0.000064 0.028149 0.043231 0.000064 0.000064 \n", "661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n", "\n", " 7 8 \n", "3795 0.131447 0.000015 \n", "3027 0.717508 0.000039 \n", "3251 0.999676 0.000040 \n", "337 0.000043 0.000043 \n", "2913 0.000048 0.000048 \n", "1149 0.000052 0.000052 \n", "802 0.000055 0.000055 \n", "10 0.006711 0.000064 \n", "4047 0.090683 0.000064 \n", "661 0.000066 0.000066 \n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n", "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n", "0 0.073153\n", "1 0.097928\n", "2 0.096995\n", "3 0.076255\n", "4 0.141707\n", "5 0.072689\n", "6 0.093970\n", "7 0.072439\n", "8 0.274864\n", "dtype: float64\n" ] } ], "source": [ "prevalent_topics(data_vectorized, file_list)" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }