{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import glob\n",
    "import copy\n",
    "import csv\n",
    "from statistics import mean, median\n",
    "from strip_markdown import strip_markdown\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9483091c-ac72-415c-932d-ac7cf7970789",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "import gensim.corpora as corpora\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.models import CoherenceModel\n",
    "from gensim.models.phrases import Phrases\n",
    "\n",
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "\n",
    "from statistics import mode\n",
    "\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "196abd6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#import nltk\n",
    "#nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3da6b590-875d-478d-aaaa-de020039c519",
   "metadata": {},
   "outputs": [],
   "source": [
    "# spacy and nltk for lemmatization\n",
    "import nltk \n",
    "#nltk.download('stopwords')\n",
    "import spacy\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "stopwords = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def metadata_for_file(file):\n",
    "    word_list = file.split()\n",
    "    word_count = len(word_list)\n",
    "    #print(word_list)\n",
    "    if word_count == 0:\n",
    "        avg_word_length = 0\n",
    "    else: \n",
    "        avg_word_length = sum(map(len, word_list))  / len(word_list)\n",
    "    #return number of paragraphs\n",
    "    return word_count, avg_word_length, word_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_from_dir(directory):\n",
    "    files = glob.glob(f\"{directory}/*\")\n",
    "    data_list = []\n",
    "    word_counts = []\n",
    "    avg_word_lengths = []\n",
    "    file_list = []\n",
    "    files_word_lists = defaultdict(list)\n",
    "    for file in files:\n",
    "        text = open(file, encoding='utf-8', errors='ignore').read()\n",
    "        #here's some of the descriptive text analysis\n",
    "        word_count, avg_word_length, word_list = metadata_for_file(text)\n",
    "        word_counts.append(word_count)\n",
    "        avg_word_lengths.append(avg_word_length)\n",
    "        #adding the data to the list of text\n",
    "        if word_count > 0:\n",
    "            files_word_lists[tuple(word_list)].append(file)\n",
    "        data_list.append(text)\n",
    "        #adding filename\n",
    "        file_list.append(file)\n",
    "    return data_list, word_counts, avg_word_lengths, file_list, "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
   "metadata": {},
   "outputs": [],
   "source": [
    "#preprocessing text data\n",
    "def preprocess(corpus_list):\n",
    "    #extending stopwords \n",
    "    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
    "    \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
    "    stopwords.extend(specific_stopwords)\n",
    "    D = copy.copy(corpus_list)\n",
    "    #stripping markdown from documents\n",
    "    D = [strip_markdown(doc) for doc in D]\n",
    "    #strip html \n",
    "    D = [re.sub(r'<[^<]+?>', '', doc, flags=re.DOTALL) for doc in D]\n",
    "    #mvp right now, can certainly be expanded as iterations of text analysis are done\n",
    "    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    D_lemma = [\" \".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]\n",
    "    return D_lemma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#preparing processed data for model usage\n",
    "def text_preparation(lemmatized_text):\n",
    "    #bigrams\n",
    "    D_bigrams = copy.copy(lemmatized_text)\n",
    "    bigram = Phrases(D_bigrams, min_count=2)\n",
    "    for i in range(len(lemmatized_text)):\n",
    "        for token in bigram[D_bigrams[i]]:\n",
    "            if '_' in token:\n",
    "                D_bigrams[i].append(token)\n",
    "    #id2word\n",
    "    id2word = corpora.Dictionary(D_bigrams)\n",
    "    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
    "    #bow representation \n",
    "    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]\n",
    "    return bag_of_words, id2word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "24799e25-2c0c-4e16-b503-68296f604f52",
   "metadata": {},
   "outputs": [],
   "source": [
    "def lda_model_identification(data_vectorized):\n",
    "    lda = LatentDirichletAllocation()\n",
    "    search_params = {'n_components': [9], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256]  }\n",
    "    model = GridSearchCV(lda, param_grid=search_params, verbose=10)\n",
    "    model.fit(data_vectorized)\n",
    "    best_lda_model = model.best_estimator_\n",
    "    print(\"Best Model's Params: \", model.best_params_)\n",
    "    print(\"Best Log Likelihood Score: \", model.best_score_)\n",
    "    print(\"Model Perplexity: \", best_lda_model.perplexity(data_vectorized))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def best_lda_model(data_vectorized, vocab):\n",
    "    lda = LatentDirichletAllocation(n_components=9, learning_decay = 0.7, batch_size = 128, max_iter = 50)\n",
    "    id_topic = lda.fit_transform(data_vectorized)\n",
    "    topic_words = {}\n",
    "    for topic, comp in enumerate(lda.components_):\n",
    "        word_idx = np.argsort(comp)[::-1][:10]\n",
    "        topic_words[topic] = [vocab[i] for i in word_idx]\n",
    "    for topic, words in topic_words.items():\n",
    "        print('Topic: %d' % topic)\n",
    "        print('  %s' % ', '.join(words))\n",
    "    #lda.print_topics(num_words=10)\n",
    "    joblib.dump(lda, '020325_README_lda.jl')\n",
    "    #lda = joblib.load('0509_lda.jl')\n",
    "    return id_topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_most_prevalent(vect_documents, documents):\n",
    "    lda = joblib.load('020725_README_lda.jl')\n",
    "    distributions = lda.transform(vect_documents)\n",
    "    most_prevalent = {0: [0, \"\"],1: [0, \"\"], 2: [0, \"\"], 3: [0, \"\"], 4: [0, \"\"], 5: [0, \"\"], 6: [0, \"\"], 7: [0, \"\"], 8: [0, \"\"]}\n",
    "    for i, topic_distribution in enumerate(distributions):\n",
    "        for j in range(9):\n",
    "            if topic_distribution[j] > most_prevalent[j][0]:\n",
    "                most_prevalent[j] = [topic_distribution[j], documents[i]]\n",
    "    print(most_prevalent)\n",
    "    return most_prevalent\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prevalent_topics(vect_documents, file_list):\n",
    "    lda = joblib.load('020725_README_lda.jl')\n",
    "    #lda = joblib.load('0514_contrib_lda.jl')\n",
    "    distributions = lda.transform(vect_documents)\n",
    "    #figuring out what the max distribution is and then figuring out the mode\n",
    "    top_topic = []\n",
    "    count_of_multiple = 0\n",
    "    topic_arrays = []\n",
    "    for i, topic_distribution in enumerate(distributions):\n",
    "        max_dist = max(topic_distribution)\n",
    "        indexes = np.where(topic_distribution == max_dist)[0]\n",
    "        if len(indexes) == 1:\n",
    "            top_topic.append(indexes[0])\n",
    "        else:\n",
    "            count_of_multiple += 1\n",
    "        topic_arrays.append(topic_distribution)\n",
    "    #most_frequent(top_topic)\n",
    "    print(count_of_multiple)\n",
    "    df = pd.DataFrame(topic_arrays)\n",
    "    #finding the distribution values for all documents\n",
    "    with open('020725_README_file_topic_distributions.csv', 'w', newline='') as csvfile:\n",
    "        fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8']\n",
    "        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "        writer.writeheader()\n",
    "        for i, row in df.iterrows():\n",
    "            project_dir =  {}\n",
    "            project_dir['filename'] = file_list[i].split(\"/\")[-1]\n",
    "            array_row = df.iloc[i].to_numpy()\n",
    "            for j in range(9):\n",
    "                project_dir[\"t\" + str(j)] = array_row[j]\n",
    "            writer.writerow(project_dir)\n",
    "    #print(df.sort_values(by=['0']).head(5))\n",
    "    for i in range(9):\n",
    "        print(\"-----------------------Topic \" + str(i) + \" --------------------------------\")\n",
    "        top5 = df.nlargest(10, i)\n",
    "        top_indices = top5.index.to_list()\n",
    "        print(top5)\n",
    "        for index in top_indices:\n",
    "            print(file_list[index])\n",
    "        bottom5 = df.nsmallest(10, i)\n",
    "        bottom_indices = bottom5.index.to_list()\n",
    "        print(bottom5)\n",
    "        for index in bottom_indices:\n",
    "            print(file_list[index])\n",
    "    averages = df.mean()\n",
    "    print(averages)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def most_frequent(topic_prevalence):\n",
    "    most_frequent_array = []\n",
    "    for j in range(11):\n",
    "        topic = mode(topic_prevalence)\n",
    "        most_frequent_array.append(topic)\n",
    "        topic_prevalence = [i for i in topic_prevalence if i != topic]\n",
    "    print(most_frequent_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "69d606fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "1f937c2e-2714-475d-b670-602164c46642",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean wordcount:  324.0929957406531\n",
      "Median wordcount:  156.0\n",
      "Mean wordlength:  6.354120246310486\n",
      "Median wordlength:  5.950514528900827\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/SOC.NORTHWESTERN.EDU/nws8519/anaconda3/lib/python3.12/html/parser.py:171: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n",
      "  k = self.parse_starttag(i)\n"
     ]
    }
   ],
   "source": [
    "listed_corpus, wordcounts, wordlengths, file_list= get_data_from_dir(readme_directory)\n",
    "print(\"Mean wordcount: \", mean(wordcounts))\n",
    "print(\"Median wordcount: \", median(wordcounts))\n",
    "print(\"Mean wordlength: \", mean(wordlengths))\n",
    "print(\"Median wordlength: \", median(wordlengths))\n",
    "lemmatized_corpus = preprocess(listed_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['020725_README_vectorizer.joblib']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "'''\n",
    "vectorizer = CountVectorizer(analyzer='word',       \n",
    "                         min_df=2,                        \n",
    "                         stop_words='english',             \n",
    "                         lowercase=True,                   \n",
    "                         token_pattern='[a-zA-Z0-9]{2,}',  \n",
    "                        )\n",
    "data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
    "joblib.dump(vectorizer, '020725_README_vectorizer.joblib')\n",
    "'''\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "d68aaf7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = joblib.load('020725_README_vectorizer.joblib')\n",
    "data_vectorized = vectorizer.transform(lemmatized_corpus)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "dd1a70c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
      "[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
      "[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1158862.039 total time=  17.6s\n",
      "[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
      "[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1121276.805 total time=  12.0s\n",
      "[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
      "[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1058330.478 total time=  12.6s\n",
      "[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
      "[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1169073.807 total time=  12.7s\n",
      "[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
      "[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1308701.275 total time=  11.9s\n",
      "[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
      "[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1157991.152 total time=  11.8s\n",
      "[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
      "[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1120570.803 total time=  11.7s\n",
      "[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
      "[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1055699.316 total time=  12.4s\n",
      "[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
      "[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1168297.207 total time=  11.7s\n",
      "[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
      "[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1307949.520 total time=  12.4s\n",
      "[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
      "[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1157830.351 total time=  11.7s\n",
      "[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
      "[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1124221.589 total time=  11.8s\n",
      "[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
      "[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1056916.516 total time=  12.0s\n",
      "[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
      "[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1169168.331 total time=  12.9s\n",
      "[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
      "[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1308175.234 total time=  12.5s\n",
      "[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
      "[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1158540.475 total time=  12.2s\n",
      "[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
      "[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1120071.919 total time=  11.8s\n",
      "[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
      "[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1061777.082 total time=  12.3s\n",
      "[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
      "[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1170380.631 total time=  11.6s\n",
      "[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
      "[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1307034.410 total time=  11.6s\n",
      "[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
      "[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1156265.357 total time=  13.2s\n",
      "[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
      "[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1121786.140 total time=  11.8s\n",
      "[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
      "[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1060299.402 total time=  12.5s\n",
      "[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
      "[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1174913.458 total time=  12.1s\n",
      "[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
      "[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1305421.859 total time=  11.6s\n",
      "[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
      "[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1158280.857 total time=  12.0s\n",
      "[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
      "[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1117369.387 total time=  11.5s\n",
      "[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
      "[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1058230.568 total time=  12.0s\n",
      "[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
      "[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1171980.166 total time=  11.9s\n",
      "[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
      "[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1305822.528 total time=  11.3s\n",
      "Best Model's Params:  {'batch_size': 128, 'learning_decay': 0.7, 'n_components': 9}\n",
      "Best Log Likelihood Score:  -1162101.5996668166\n",
      "Model Perplexity:  2176.5064559983784\n"
     ]
    }
   ],
   "source": [
    "lda_model_identification(data_vectorized)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "aa83d20f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic: 0\n",
      "  image, data, key, file, color, option, support, format, default, mode\n",
      "Topic: 1\n",
      "  data, test, library, object, implementation, support, packet, used, byte, class\n",
      "Topic: 2\n",
      "  license, copyright, perl, gnu, free, version, module, public, general, warranty\n",
      "Topic: 3\n",
      "  test, value, function, return, method, class, string, type, object, example\n",
      "Topic: 4\n",
      "  http, git, server, install, client, request, test, version, project, command\n",
      "Topic: 5\n",
      "  json, node, require, string, parser, var, object, parse, function, font\n",
      "Topic: 6\n",
      "  command, output, option, process, make, program, script, tool, file, linux\n",
      "Topic: 7\n",
      "  table, html, tag, text, django, xml, example, path, template, default\n",
      "Topic: 8\n",
      "  install, make, build, library, version, directory, file, package, window, project\n"
     ]
    }
   ],
   "source": [
    "topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "f4345bd6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: [0.9963399069190733, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'], 1: [0.9987558745140913, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'], 2: [0.999271074201955, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'], 3: [0.9966940236237574, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'], 4: [0.9962628678061417, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'], 5: [0.998166117886522, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'], 6: [0.9670683884278027, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'], 7: [0.9996764637160757, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'], 8: [0.9976094391943828, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: [0.9963399069190733,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'],\n",
       " 1: [0.9987558745140913,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'],\n",
       " 2: [0.999271074201955,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'],\n",
       " 3: [0.9966940236237574,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'],\n",
       " 4: [0.9962628678061417,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'],\n",
       " 5: [0.998166117886522,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'],\n",
       " 6: [0.9670683884278027,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'],\n",
       " 7: [0.9996764637160757,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'],\n",
       " 8: [0.9976094391943828,\n",
       "  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_most_prevalent(data_vectorized, file_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "23468e82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "183\n",
      "-----------------------Topic 0 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3142  0.996340  0.000457  0.000458  0.000458  0.000458  0.000457  0.000457   \n",
      "810   0.995085  0.000614  0.000614  0.000614  0.000614  0.000614  0.000614   \n",
      "3064  0.983533  0.002058  0.002058  0.002059  0.002059  0.002058  0.002059   \n",
      "2980  0.960597  0.000512  0.035817  0.000512  0.000513  0.000512  0.000512   \n",
      "197   0.892184  0.000950  0.000951  0.000950  0.000950  0.101164  0.000950   \n",
      "131   0.867562  0.001765  0.001764  0.036755  0.001764  0.001765  0.001765   \n",
      "3694  0.864345  0.001390  0.001390  0.001389  0.001390  0.049473  0.001390   \n",
      "582   0.857786  0.000105  0.064223  0.043669  0.000105  0.000105  0.000105   \n",
      "3026  0.851801  0.018529  0.018519  0.018536  0.018522  0.018532  0.018522   \n",
      "1647  0.851778  0.018530  0.018519  0.018530  0.018528  0.018532  0.018540   \n",
      "\n",
      "             7         8  \n",
      "3142  0.000458  0.000458  \n",
      "810   0.000614  0.000615  \n",
      "3064  0.002059  0.002058  \n",
      "2980  0.000512  0.000512  \n",
      "197   0.000950  0.000950  \n",
      "131   0.085096  0.001765  \n",
      "3694  0.001389  0.077843  \n",
      "582   0.033796  0.000105  \n",
      "3026  0.018521  0.018519  \n",
      "1647  0.018521  0.018523  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2362_plasma_breeze-plymouth.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1356_katomic.git_README.levels\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/541_dunst-project_dunst_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2497_borntyping_python-colorlog.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2421_whipper-team_whipper.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3297_takaswie_hinawa-utils.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/697_tiwai_awesfx.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2831_kilobyte_pmemkv_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2095_warner_magic-wormhole.git_README.md\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "37    0.000045  0.164319  0.053585  0.504039  0.040306  0.006119  0.171102   \n",
      "2913  0.000048  0.195954  0.000048  0.019170  0.723177  0.003377  0.058128   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "3287  0.000070  0.000070  0.007213  0.190779  0.000070  0.028435  0.027117   \n",
      "455   0.000071  0.481662  0.000071  0.030261  0.000071  0.000071  0.336087   \n",
      "1313  0.000076  0.047031  0.000076  0.097269  0.012519  0.034803  0.056043   \n",
      "1598  0.000079  0.000079  0.000079  0.542049  0.000079  0.000079  0.000079   \n",
      "\n",
      "             7         8  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "37    0.005875  0.054610  \n",
      "2913  0.000048  0.000048  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "3287  0.746179  0.000070  \n",
      "455   0.000071  0.151635  \n",
      "1313  0.104692  0.647491  \n",
      "1598  0.000079  0.457399  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/722_zeromq_czmq.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2774_timlegge_perl-XML-Generator_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3509_jirka-h_haveged_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/511_mmottl_ocaml-makefile.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4029_yayahjb_cqrlib.git_README_CQRlib.html\n",
      "-----------------------Topic 1 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "328   0.000156  0.998756  0.000155  0.000156  0.000156  0.000156  0.000156   \n",
      "2081  0.000601  0.995192  0.000601  0.000601  0.000601  0.000601  0.000601   \n",
      "2672  0.000862  0.993105  0.000862  0.000862  0.000862  0.000862  0.000862   \n",
      "389   0.004276  0.965797  0.004276  0.004275  0.004274  0.004275  0.004277   \n",
      "3087  0.005558  0.955534  0.005557  0.005556  0.005560  0.005558  0.005559   \n",
      "2478  0.006538  0.947694  0.006537  0.006539  0.006538  0.006538  0.006538   \n",
      "2272  0.006955  0.944406  0.006946  0.006946  0.006950  0.006947  0.006950   \n",
      "853   0.000350  0.932979  0.000350  0.000350  0.000350  0.000350  0.000350   \n",
      "1081  0.000567  0.931828  0.000567  0.064202  0.000567  0.000567  0.000567   \n",
      "1443  0.008555  0.931588  0.008548  0.008556  0.008549  0.008553  0.008550   \n",
      "\n",
      "             7         8  \n",
      "328   0.000156  0.000155  \n",
      "2081  0.000601  0.000601  \n",
      "2672  0.000862  0.000862  \n",
      "389   0.004275  0.004275  \n",
      "3087  0.005558  0.005560  \n",
      "2478  0.006537  0.006539  \n",
      "2272  0.006951  0.006949  \n",
      "853   0.000350  0.064574  \n",
      "1081  0.000567  0.000567  \n",
      "1443  0.008550  0.008551  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2517_cleder_fastkml_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2967_mila-iqia_picklable-itertools_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2575_samtools_htsjdk.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/543_boto_s3transfer_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2308_php-fig_cache.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2772_storaged-project_libblockdev_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/304_pauldmccarthy_indexed_gzip.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1282_python-lz4_python-lz4.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2289_carlodefalco_octave-mpi_README.md\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3795  0.002233  0.000015  0.000015  0.248725  0.000015  0.000015  0.617519   \n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3027  0.042031  0.000039  0.110408  0.114454  0.000039  0.015444  0.000039   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "964   0.089514  0.000043  0.006234  0.000043  0.000043  0.071786  0.043128   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "3072  0.582368  0.000065  0.019226  0.039125  0.000065  0.041239  0.209392   \n",
      "661   0.300668  0.000066  0.010081  0.616868  0.000066  0.072053  0.000066   \n",
      "3221  0.221286  0.000069  0.040534  0.620289  0.000069  0.000069  0.000069   \n",
      "\n",
      "             7         8  \n",
      "3795  0.131447  0.000015  \n",
      "3966  0.000035  0.295755  \n",
      "3027  0.717508  0.000039  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "964   0.000043  0.789164  \n",
      "2865  0.000047  0.452085  \n",
      "3072  0.069255  0.039265  \n",
      "661   0.000066  0.000066  \n",
      "3221  0.117548  0.000069  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2056_OpenPrinting_foomatic-db.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4206_dnmfarrell_Data-FormValidator.git_README\n",
      "-----------------------Topic 2 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "672   0.000091  0.000091  0.999271  0.000091  0.000091  0.000091  0.000091   \n",
      "3181  0.000152  0.000152  0.998782  0.000152  0.000152  0.000152  0.000152   \n",
      "3929  0.000966  0.000967  0.992265  0.000967  0.000967  0.000967  0.000967   \n",
      "1475  0.001039  0.001039  0.991690  0.001039  0.001039  0.001039  0.001039   \n",
      "1309  0.001059  0.001059  0.991530  0.001059  0.001059  0.001059  0.001059   \n",
      "4022  0.001079  0.001079  0.991367  0.001079  0.001079  0.001079  0.001079   \n",
      "3212  0.001112  0.001112  0.991106  0.001112  0.001112  0.001112  0.001112   \n",
      "2521  0.001123  0.001124  0.991016  0.001123  0.001123  0.001123  0.001123   \n",
      "2024  0.001134  0.001134  0.990927  0.001134  0.001135  0.001134  0.001134   \n",
      "2147  0.001146  0.001146  0.990834  0.001146  0.001146  0.001146  0.001146   \n",
      "\n",
      "             7         8  \n",
      "672   0.000091  0.000091  \n",
      "3181  0.000152  0.000152  \n",
      "3929  0.000967  0.000967  \n",
      "1475  0.001039  0.001039  \n",
      "1309  0.001059  0.001059  \n",
      "4022  0.001079  0.001079  \n",
      "3212  0.001112  0.001112  \n",
      "2521  0.001123  0.001123  \n",
      "2024  0.001134  0.001134  \n",
      "2147  0.001146  0.001146  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/666_RobertBeckebans_RBDOOM-3-BFG_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1439_pdfminer_pdfminer.six.git_README.html\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1191_backuppc_backuppc-xs.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3536_jkeenan_extutils-modulemaker.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/898_adrianlopezroche_fdupes.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/767_knik0_faad2.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3331_ClusterLabs_fence-agents_README.licence\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/139_seattlerb_ruby_parser.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2854_faye_faye_README.txt\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3795  0.002233  0.000015  0.000015  0.248725  0.000015  0.000015  0.617519   \n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3648  0.540425  0.023001  0.000038  0.245106  0.034566  0.000038  0.000038   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "2913  0.000048  0.195954  0.000048  0.019170  0.723177  0.003377  0.058128   \n",
      "1537  0.313891  0.089990  0.000049  0.023406  0.134488  0.000049  0.047071   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "10    0.000851  0.259910  0.000064  0.008255  0.000064  0.000845  0.723237   \n",
      "\n",
      "             7         8  \n",
      "3795  0.131447  0.000015  \n",
      "3966  0.000035  0.295755  \n",
      "3648  0.000038  0.156749  \n",
      "3251  0.999676  0.000040  \n",
      "2865  0.000047  0.452085  \n",
      "2913  0.000048  0.000048  \n",
      "1537  0.000049  0.391008  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "10    0.006711  0.000064  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n",
      "-----------------------Topic 3 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "4184  0.000413  0.000413  0.000413  0.996694  0.000413  0.000413  0.000413   \n",
      "634   0.000585  0.000585  0.000585  0.995319  0.000585  0.000585  0.000585   \n",
      "541   0.001059  0.001059  0.001059  0.991531  0.001058  0.001059  0.001059   \n",
      "4086  0.001069  0.001069  0.001069  0.991448  0.001069  0.001069  0.001069   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "1082  0.000108  0.000108  0.038632  0.960609  0.000108  0.000108  0.000108   \n",
      "3436  0.000214  0.000214  0.000214  0.953378  0.000214  0.045126  0.000214   \n",
      "3099  0.000483  0.000483  0.000483  0.945309  0.030232  0.000483  0.000483   \n",
      "168   0.000204  0.000204  0.016534  0.944392  0.037848  0.000204  0.000204   \n",
      "805   0.000420  0.000419  0.000420  0.939900  0.057163  0.000420  0.000419   \n",
      "\n",
      "             7         8  \n",
      "4184  0.000413  0.000413  \n",
      "634   0.000585  0.000585  \n",
      "541   0.001058  0.001059  \n",
      "4086  0.001069  0.001069  \n",
      "337   0.000043  0.000043  \n",
      "1082  0.000108  0.000108  \n",
      "3436  0.000214  0.000214  \n",
      "3099  0.000483  0.021559  \n",
      "168   0.000204  0.000204  \n",
      "805   0.000419  0.000420  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3251_eproxus_meck.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1044_wolever_parameterized_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2204_easystats_parameters.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4060_perl5-utils_Params-Util_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/629_hamcrest_PyHamcrest_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4064_testing-cabal_mock_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3817_webmozart_assert_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1432_fluxx_exam_README.rst\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3355  0.044710  0.035340  0.279984  0.000040  0.000040  0.019913  0.122916   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "964   0.089514  0.000043  0.006234  0.000043  0.000043  0.071786  0.043128   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "3870  0.312524  0.017558  0.065654  0.000051  0.032472  0.000051  0.098971   \n",
      "3366  0.196719  0.041714  0.043868  0.000061  0.309832  0.000061  0.306935   \n",
      "1239  0.133374  0.866158  0.000067  0.000067  0.000067  0.000067  0.000067   \n",
      "2205  0.306048  0.030212  0.382652  0.000073  0.018162  0.071704  0.032824   \n",
      "1183  0.281059  0.000077  0.000077  0.000077  0.000077  0.007434  0.000077   \n",
      "\n",
      "             7         8  \n",
      "3966  0.000035  0.295755  \n",
      "3355  0.333160  0.163897  \n",
      "3251  0.999676  0.000040  \n",
      "964   0.000043  0.789164  \n",
      "2865  0.000047  0.452085  \n",
      "3870  0.455920  0.016799  \n",
      "3366  0.000061  0.100750  \n",
      "1239  0.000067  0.000067  \n",
      "2205  0.045686  0.112640  \n",
      "1183  0.000077  0.711047  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1611_arno-iptables-firewall_aif.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2225_JamesHeinrich_getID3_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/362_mate-desktop_marco.git_README\n",
      "-----------------------Topic 4 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "331   0.000467  0.000467  0.000467  0.000467  0.996263  0.000467  0.000467   \n",
      "3569  0.000601  0.000601  0.000601  0.000601  0.995193  0.000601  0.000601   \n",
      "2222  0.000717  0.000717  0.000717  0.000717  0.994262  0.000717  0.000717   \n",
      "3364  0.000761  0.000761  0.000762  0.000762  0.993908  0.000761  0.000761   \n",
      "659   0.000806  0.000805  0.000805  0.000807  0.993555  0.000806  0.000805   \n",
      "4125  0.000849  0.000849  0.000848  0.000849  0.993211  0.000849  0.000849   \n",
      "3191  0.000975  0.000976  0.000975  0.000975  0.992198  0.000975  0.000975   \n",
      "1342  0.000992  0.000992  0.000993  0.000992  0.992061  0.000992  0.000992   \n",
      "3762  0.001090  0.001090  0.001089  0.001089  0.991284  0.001089  0.001089   \n",
      "1350  0.001135  0.001135  0.001134  0.001135  0.990924  0.001135  0.001135   \n",
      "\n",
      "             7         8  \n",
      "331   0.000467  0.000467  \n",
      "3569  0.000601  0.000601  \n",
      "2222  0.000717  0.000717  \n",
      "3364  0.000762  0.000761  \n",
      "659   0.000806  0.000806  \n",
      "4125  0.000849  0.000848  \n",
      "3191  0.000975  0.000975  \n",
      "1342  0.000992  0.000992  \n",
      "3762  0.001089  0.001090  \n",
      "1350  0.001134  0.001135  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3145_tkem_mopidy-dleyna.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2600_spyder-ide_qtpy_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/953_rroemhild_flask-ldapconn_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3185_mopidy_mopidy-scrobbler.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2310_tduehr_omniauth-cas3_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2822_gawel_panoramisk.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3648_tkem_mopidy-podcast-itunes_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3969_kingosticks_mopidy-tunein_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3103_erdewit_nest_asyncio_README.rst\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3795  0.002233  0.000015  0.000015  0.248725  0.000015  0.000015  0.617519   \n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3027  0.042031  0.000039  0.110408  0.114454  0.000039  0.015444  0.000039   \n",
      "3355  0.044710  0.035340  0.279984  0.000040  0.000040  0.019913  0.122916   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "964   0.089514  0.000043  0.006234  0.000043  0.000043  0.071786  0.043128   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "\n",
      "             7         8  \n",
      "3795  0.131447  0.000015  \n",
      "3966  0.000035  0.295755  \n",
      "3027  0.717508  0.000039  \n",
      "3355  0.333160  0.163897  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "964   0.000043  0.789164  \n",
      "2865  0.000047  0.452085  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "-----------------------Topic 5 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3370  0.000229  0.000229  0.000229  0.000229  0.000229  0.998166  0.000229   \n",
      "1017  0.000333  0.000333  0.000333  0.000333  0.000333  0.997338  0.000333   \n",
      "3174  0.000423  0.000423  0.000423  0.000423  0.000423  0.996619  0.000423   \n",
      "717   0.001710  0.001711  0.001710  0.001710  0.001711  0.986318  0.001710   \n",
      "4132  0.002268  0.002269  0.002268  0.002269  0.002269  0.981849  0.002269   \n",
      "3     0.002778  0.002779  0.002779  0.002780  0.002779  0.977767  0.002779   \n",
      "1555  0.002925  0.002925  0.002924  0.002925  0.002925  0.976601  0.002926   \n",
      "1336  0.003969  0.003970  0.003969  0.003972  0.003970  0.968244  0.003969   \n",
      "705   0.005565  0.005560  0.005561  0.005569  0.005559  0.955511  0.005559   \n",
      "2220  0.006536  0.006537  0.006541  0.006540  0.006537  0.947699  0.006536   \n",
      "\n",
      "             7         8  \n",
      "3370  0.000229  0.000229  \n",
      "1017  0.000333  0.000333  \n",
      "3174  0.000423  0.000423  \n",
      "717   0.001710  0.001710  \n",
      "4132  0.002269  0.002269  \n",
      "3     0.002779  0.002780  \n",
      "1555  0.002925  0.002925  \n",
      "1336  0.003969  0.003969  \n",
      "705   0.005558  0.005557  \n",
      "2220  0.006537  0.006536  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2594_crsmithdev_arrow.git_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2879_jonschlinkert_map-visit_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1625_keis_base58_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/162_sebastianbergmann_comparator_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1075_sebastianbergmann_object-enumerator_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/830_npm_abbrev-js_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1722_unclechu_node-deep-extend.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2412_crosswire-bible-society_nave_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4251_hughsk_is-typedarray_README.md\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3795  0.002233  0.000015  0.000015  0.248725  0.000015  0.000015  0.617519   \n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3648  0.540425  0.023001  0.000038  0.245106  0.034566  0.000038  0.000038   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "1537  0.313891  0.089990  0.000049  0.023406  0.134488  0.000049  0.047071   \n",
      "3870  0.312524  0.017558  0.065654  0.000051  0.032472  0.000051  0.098971   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "\n",
      "             7         8  \n",
      "3795  0.131447  0.000015  \n",
      "3966  0.000035  0.295755  \n",
      "3648  0.000038  0.156749  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "2865  0.000047  0.452085  \n",
      "1537  0.000049  0.391008  \n",
      "3870  0.455920  0.016799  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "-----------------------Topic 6 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "1893  0.004116  0.004116  0.004116  0.004116  0.004119  0.004116  0.967068   \n",
      "2605  0.000456  0.000456  0.000456  0.000456  0.000456  0.031675  0.965135   \n",
      "2308  0.004630  0.004630  0.004630  0.004630  0.004634  0.004630  0.962953   \n",
      "2446  0.004630  0.004630  0.004630  0.004630  0.004634  0.004630  0.962953   \n",
      "4152  0.004630  0.004630  0.004630  0.004630  0.004634  0.004630  0.962953   \n",
      "371   0.005053  0.005053  0.005053  0.005051  0.005053  0.005052  0.959573   \n",
      "1589  0.006946  0.006946  0.006947  0.006945  0.006947  0.006945  0.944423   \n",
      "3111  0.006950  0.006946  0.006947  0.006950  0.006950  0.006950  0.944408   \n",
      "3995  0.056876  0.000100  0.000100  0.000100  0.000100  0.000100  0.942423   \n",
      "3135  0.007414  0.007415  0.007409  0.007414  0.007411  0.007411  0.940706   \n",
      "\n",
      "             7         8  \n",
      "1893  0.004116  0.004117  \n",
      "2605  0.000456  0.000456  \n",
      "2308  0.004630  0.004631  \n",
      "2446  0.004630  0.004631  \n",
      "4152  0.004630  0.004631  \n",
      "371   0.005052  0.005059  \n",
      "1589  0.006949  0.006953  \n",
      "3111  0.006952  0.006948  \n",
      "3995  0.000100  0.000100  \n",
      "3135  0.007410  0.007411  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1607_ppentchev_feature-check_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2577_kodi-pvr_pvr.dvbviewer.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2363_kodi-pvr_pvr.hdhomerun.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3414_kodi-pvr_pvr.njoy.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3527_hackerschoice_THC-Archive_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2455_tcolar_wmfrog_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2103_aperezdc_signify.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/119_linux-thinkpad_tp_smapi_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3587_igaw_jitterdebugger.git_README\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3597  0.520753  0.030707  0.076996  0.009554  0.064448  0.004423  0.000021   \n",
      "3648  0.540425  0.023001  0.000038  0.245106  0.034566  0.000038  0.000038   \n",
      "3027  0.042031  0.000039  0.110408  0.114454  0.000039  0.015444  0.000039   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "4047  0.672539  0.165141  0.000064  0.028149  0.043231  0.000064  0.000064   \n",
      "661   0.300668  0.000066  0.010081  0.616868  0.000066  0.072053  0.000066   \n",
      "1239  0.133374  0.866158  0.000067  0.000067  0.000067  0.000067  0.000067   \n",
      "\n",
      "             7         8  \n",
      "3597  0.000021  0.293076  \n",
      "3648  0.000038  0.156749  \n",
      "3027  0.717508  0.000039  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "4047  0.090683  0.000064  \n",
      "661   0.000066  0.000066  \n",
      "1239  0.000067  0.000067  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n",
      "-----------------------Topic 7 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "2445  0.000363  0.000363  0.000363  0.000363  0.000363  0.000363  0.000363   \n",
      "1971  0.001001  0.001002  0.001001  0.001002  0.001001  0.001002  0.001001   \n",
      "2288  0.001635  0.001635  0.001634  0.001635  0.001635  0.001636  0.001635   \n",
      "52    0.002138  0.002139  0.002138  0.002140  0.002138  0.002138  0.002138   \n",
      "2276  0.023188  0.000155  0.000155  0.000155  0.000155  0.000155  0.000155   \n",
      "1682  0.000772  0.000772  0.000772  0.000772  0.039993  0.000772  0.000772   \n",
      "3082  0.009264  0.009261  0.009262  0.009265  0.009266  0.009262  0.009269   \n",
      "1117  0.084798  0.000643  0.000642  0.000643  0.000643  0.000642  0.000643   \n",
      "2480  0.000185  0.000185  0.000185  0.000185  0.000185  0.000185  0.000185   \n",
      "\n",
      "             7         8  \n",
      "3251  0.999676  0.000040  \n",
      "2445  0.997094  0.000363  \n",
      "1971  0.991989  0.001001  \n",
      "2288  0.986920  0.001635  \n",
      "52    0.982894  0.002138  \n",
      "2276  0.975728  0.000155  \n",
      "1682  0.954604  0.000772  \n",
      "3082  0.925882  0.009269  \n",
      "1117  0.910704  0.000642  \n",
      "2480  0.909884  0.088824  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3104_scrapy-plugins_scrapy-djangoitem_README.rst\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1445_carljm_django-model-utils.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/864_bfirsh_django-ordered-model.git_README.markdown\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3002_ionelmc_python-darkslide.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4196_alexott_muse_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/995_coleifer_wtf-peewee_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/811_sopel-irc_sopel.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1019_jazzband_django-sortedm2m.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/356_wanderlust_apel_README.en\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3597  0.520753  0.030707  0.076996  0.009554  0.064448  0.004423  0.000021   \n",
      "3966  0.029730  0.000035  0.000035  0.000035  0.000035  0.000035  0.674307   \n",
      "3648  0.540425  0.023001  0.000038  0.245106  0.034566  0.000038  0.000038   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "964   0.089514  0.000043  0.006234  0.000043  0.000043  0.071786  0.043128   \n",
      "2865  0.508832  0.000047  0.000047  0.000047  0.000047  0.000047  0.038802   \n",
      "2913  0.000048  0.195954  0.000048  0.019170  0.723177  0.003377  0.058128   \n",
      "1537  0.313891  0.089990  0.000049  0.023406  0.134488  0.000049  0.047071   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "\n",
      "             7         8  \n",
      "3597  0.000021  0.293076  \n",
      "3966  0.000035  0.295755  \n",
      "3648  0.000038  0.156749  \n",
      "337   0.000043  0.000043  \n",
      "964   0.000043  0.789164  \n",
      "2865  0.000047  0.452085  \n",
      "2913  0.000048  0.000048  \n",
      "1537  0.000049  0.391008  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "-----------------------Topic 8 --------------------------------\n",
      "             0         1         2         3         4         5         6  \\\n",
      "140   0.000299  0.000299  0.000299  0.000299  0.000299  0.000299  0.000299   \n",
      "444   0.000475  0.000475  0.000475  0.000475  0.000475  0.000475  0.000475   \n",
      "124   0.000517  0.000517  0.000517  0.000517  0.000517  0.000517  0.000517   \n",
      "2319  0.000559  0.000559  0.000559  0.000559  0.000559  0.000559  0.000559   \n",
      "3911  0.000564  0.000564  0.000564  0.000564  0.000564  0.000564  0.000564   \n",
      "2132  0.000621  0.000621  0.000621  0.000621  0.000621  0.000621  0.000621   \n",
      "75    0.000635  0.000635  0.000635  0.000635  0.000635  0.000635  0.000635   \n",
      "529   0.000635  0.000635  0.000635  0.000635  0.000635  0.000635  0.000635   \n",
      "2202  0.000642  0.000642  0.000642  0.000643  0.000642  0.000643  0.000643   \n",
      "3084  0.000642  0.000642  0.000642  0.000643  0.000642  0.000643  0.000643   \n",
      "\n",
      "             7         8  \n",
      "140   0.000299  0.997609  \n",
      "444   0.000475  0.996200  \n",
      "124   0.000517  0.995864  \n",
      "2319  0.000559  0.995531  \n",
      "3911  0.000565  0.995485  \n",
      "2132  0.000621  0.995032  \n",
      "75    0.000635  0.994919  \n",
      "529   0.000635  0.994919  \n",
      "2202  0.000643  0.994860  \n",
      "3084  0.000643  0.994860  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3202_extensions_dune-grid-glue_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2977_airspy_airspyone_host.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2136_wxMaxima-developers_wxmaxima_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2068_math-comp_math-comp_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2814_MaartenBaert_ssr_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2954_core_dune-localfunctions_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1140_core_dune-geometry_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/305_staging_dune-typetree_README.GIT\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4205_staging_dune-functions_README\n",
      "             0         1         2         3         4         5         6  \\\n",
      "3795  0.002233  0.000015  0.000015  0.248725  0.000015  0.000015  0.617519   \n",
      "3027  0.042031  0.000039  0.110408  0.114454  0.000039  0.015444  0.000039   \n",
      "3251  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040  0.000040   \n",
      "337   0.000043  0.000042  0.033554  0.966149  0.000042  0.000042  0.000043   \n",
      "2913  0.000048  0.195954  0.000048  0.019170  0.723177  0.003377  0.058128   \n",
      "1149  0.000052  0.650270  0.000052  0.349365  0.000052  0.000052  0.000052   \n",
      "802   0.000055  0.850160  0.000055  0.149458  0.000055  0.000055  0.000055   \n",
      "10    0.000851  0.259910  0.000064  0.008255  0.000064  0.000845  0.723237   \n",
      "4047  0.672539  0.165141  0.000064  0.028149  0.043231  0.000064  0.000064   \n",
      "661   0.300668  0.000066  0.010081  0.616868  0.000066  0.072053  0.000066   \n",
      "\n",
      "             7         8  \n",
      "3795  0.131447  0.000015  \n",
      "3027  0.717508  0.000039  \n",
      "3251  0.999676  0.000040  \n",
      "337   0.000043  0.000043  \n",
      "2913  0.000048  0.000048  \n",
      "1149  0.000052  0.000052  \n",
      "802   0.000055  0.000055  \n",
      "10    0.006711  0.000064  \n",
      "4047  0.090683  0.000064  \n",
      "661   0.000066  0.000066  \n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n",
      "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
      "0    0.073153\n",
      "1    0.097928\n",
      "2    0.096995\n",
      "3    0.076255\n",
      "4    0.141707\n",
      "5    0.072689\n",
      "6    0.093970\n",
      "7    0.072439\n",
      "8    0.274864\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "prevalent_topics(data_vectorized, file_list)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}