1200 lines
72 KiB
Plaintext
1200 lines
72 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "e09a84d6-cbd4-4a12-8e96-3775f734a262",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import glob\n",
|
|
"import copy\n",
|
|
"import csv\n",
|
|
"from statistics import mean, median\n",
|
|
"from strip_markdown import strip_markdown\n",
|
|
"import joblib"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "9483091c-ac72-415c-932d-ac7cf7970789",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import gensim\n",
|
|
"import gensim.corpora as corpora\n",
|
|
"from gensim.utils import simple_preprocess\n",
|
|
"from gensim.models import CoherenceModel\n",
|
|
"from gensim.models.phrases import Phrases\n",
|
|
"\n",
|
|
"from sklearn.decomposition import LatentDirichletAllocation\n",
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
|
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
|
|
"\n",
|
|
"from statistics import mode\n",
|
|
"\n",
|
|
"from collections import defaultdict"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "196abd6a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#import nltk\n",
|
|
"#nltk.download('wordnet')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "3da6b590-875d-478d-aaaa-de020039c519",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# spacy and nltk for lemmatization\n",
|
|
"import nltk \n",
|
|
"#nltk.download('stopwords')\n",
|
|
"import spacy\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
|
"\n",
|
|
"stopwords = stopwords.words('english')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "60c137ae-6fe9-4b03-b899-6141b1645d6b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def metadata_for_file(file):\n",
"    \"\"\"Return basic word statistics for one document.\n",
"\n",
"    `file` is the document text itself; it is split on whitespace.\n",
"    The result is (word_count, avg_word_length, word_list), where the\n",
"    average is 0 for text that contains no words.\n",
"    \"\"\"\n",
"    tokens = file.split()\n",
"    n_tokens = len(tokens)\n",
"    if n_tokens:\n",
"        mean_length = sum(len(token) for token in tokens) / n_tokens\n",
"    else:\n",
"        mean_length = 0\n",
"    return n_tokens, mean_length, tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "2e674fef-adb4-48c9-86a0-a655c41a95f3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_data_from_dir(directory):\n",
"    \"\"\"Read every file directly under `directory` and collect corpus data.\n",
"\n",
"    Returns a 4-tuple (data_list, word_counts, avg_word_lengths, file_list):\n",
"      data_list        -- raw text of each file that contains at least one word\n",
"      word_counts      -- whitespace-token count of every file read\n",
"      avg_word_lengths -- mean token length of every file read (0 if empty)\n",
"      file_list        -- path of each document in data_list, in the same order\n",
"    \"\"\"\n",
"    import os  # function-scope import: the notebook's import cells do not load os\n",
"\n",
"    data_list = []\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    # glob returns matches in arbitrary order; sorting keeps document order\n",
"    # (and every row index derived from it) stable between runs.\n",
"    for file in sorted(glob.glob(f\"{directory}/*\")):\n",
"        if not os.path.isfile(file):\n",
"            continue  # open() would raise on a sub-directory\n",
"        # the context manager closes each handle instead of leaking it\n",
"        with open(file, encoding='utf-8', errors='ignore') as handle:\n",
"            text = handle.read()\n",
"        #here's some of the descriptive text analysis\n",
"        word_count, avg_word_length, _ = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        # NOTE(review): the exported notebook lost its indentation, so it is\n",
"        # unclear whether the original guard also covered these two appends;\n",
"        # confirm that empty files are meant to be left out of the corpus.\n",
"        if word_count > 0:\n",
"            data_list.append(text)\n",
"            file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "2b332b10-bfc8-4566-8c52-19a8a334af00",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#preprocessing text data\n",
|
|
"def preprocess(corpus_list):\n",
"    \"\"\"Clean raw documents into space-joined strings of lemmatized tokens.\n",
"\n",
"    For each document: strip markdown, strip HTML-style tags, tokenize with\n",
"    simple_preprocess, drop stopwords and tokens of two characters or fewer,\n",
"    lemmatize, then drop any lemma that is itself a stopword.\n",
"\n",
"    Returns a list with one string per input document, in input order.\n",
"    Side effect: the module-level `stopwords` list gains the corpus-specific\n",
"    stopwords below (each at most once).\n",
"    \"\"\"\n",
"    #extending stopwords\n",
"    specific_stopwords = [\"http\", \"com\", \"www\", \"org\", \"file\", \"code\", \"time\", \"software\", \"use\", \"user\", \"set\", \"line\", \"run\", \"source\", \"github\",\n",
"                          \"lineno\", \"python\", \"php\", \"ruby\", \"api\"]\n",
"    # Only add entries that are missing, so calling preprocess() repeatedly\n",
"    # no longer appends duplicate copies to the shared list.\n",
"    missing = [word for word in specific_stopwords if word not in stopwords]\n",
"    stopwords.extend(missing)\n",
"    # a set gives constant-time membership tests instead of a list scan per token\n",
"    stop_set = set(stopwords)\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    processed = []\n",
"    for doc in corpus_list:\n",
"        #stripping markdown from documents\n",
"        text = strip_markdown(doc)\n",
"        #strip html\n",
"        text = re.sub(r'<[^<]+?>', '', text, flags=re.DOTALL)\n",
"        tokens = [token for token in simple_preprocess(text)\n",
"                  if token not in stop_set and len(token) > 2]\n",
"        lemmas = [lemmatizer.lemmatize(token) for token in tokens]\n",
"        # Lemmatizing can map an inflected form back onto a stopword\n",
"        # (e.g. \"files\" -> \"file\"), so the stopword filter is applied again.\n",
"        processed.append(\" \".join(lemma for lemma in lemmas if lemma not in stop_set))\n",
"    return processed"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "2a26b7ef-d2df-4e1d-8aeb-66706ac6cbb7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#preparing processed data for model usage\n",
|
|
"def text_preparation(lemmatized_text):\n",
"    \"\"\"Build a gensim bag-of-words corpus (with bigrams) and its dictionary.\n",
"\n",
"    `lemmatized_text` is a list of documents. Each document may be a list of\n",
"    tokens or a single whitespace-separated string (which is split here).\n",
"    The caller's documents are not modified.\n",
"\n",
"    Returns (bag_of_words, id2word): the doc2bow vector of every document and\n",
"    the gensim Dictionary after filter_extremes(no_below=5, no_above=0.5).\n",
"    \"\"\"\n",
"    # copy.copy() only duplicated the outer list, so appending bigrams used to\n",
"    # modify the caller's token lists; build independent per-document lists.\n",
"    docs = [doc.split() if isinstance(doc, str) else list(doc) for doc in lemmatized_text]\n",
"    #bigrams\n",
"    bigram = Phrases(docs, min_count=2)\n",
"    for doc in docs:\n",
"        # collect first, then extend, so doc is not modified while it is read\n",
"        joined = [token for token in bigram[doc] if '_' in token]\n",
"        doc.extend(joined)\n",
"    #id2word\n",
"    id2word = corpora.Dictionary(docs)\n",
"    id2word.filter_extremes(no_below=5, no_above=0.5)\n",
"    #bow representation\n",
"    bag_of_words = [id2word.doc2bow(doc) for doc in docs]\n",
"    return bag_of_words, id2word"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "24799e25-2c0c-4e16-b503-68296f604f52",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def lda_model_identification(data_vectorized, random_state=None):\n",
"    \"\"\"Grid-search LDA hyper-parameters and report the best configuration.\n",
"\n",
"    Runs GridSearchCV over learning_decay and batch_size with n_components\n",
"    fixed at 9, then prints the best parameters, the best score and the\n",
"    perplexity of the best estimator on `data_vectorized`.\n",
"\n",
"    random_state is passed to LatentDirichletAllocation; the default None\n",
"    keeps the previous unseeded behaviour.\n",
"\n",
"    Returns the best estimator found by the search (previously the function\n",
"    returned None, so the fitted model was lost).\n",
"    \"\"\"\n",
"    search_params = {'n_components': [9], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256] }\n",
"    search = GridSearchCV(LatentDirichletAllocation(random_state=random_state),\n",
"                          param_grid=search_params, verbose=10)\n",
"    search.fit(data_vectorized)\n",
"    # named best_estimator so it does not shadow the best_lda_model() function\n",
"    best_estimator = search.best_estimator_\n",
"    print(\"Best Model's Params: \", search.best_params_)\n",
"    print(\"Best Log Likelihood Score: \", search.best_score_)\n",
"    print(\"Model Perplexity: \", best_estimator.perplexity(data_vectorized))\n",
"    return best_estimator"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "d3b5785f-8272-44f5-9aee-e5f5e97452e5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def best_lda_model(data_vectorized, vocab, n_top_words=10, model_path='020325_README_lda.jl', random_state=None):\n",
"    \"\"\"Fit the chosen LDA configuration, print its topics and save the model.\n",
"\n",
"    data_vectorized -- document-term matrix to fit on\n",
"    vocab           -- sequence mapping a term's column index to the term\n",
"    n_top_words     -- how many highest-weighted terms to print per topic\n",
"    model_path      -- file the fitted model is written to with joblib.dump\n",
"    random_state    -- passed to LatentDirichletAllocation; None keeps the\n",
"                       previous unseeded behaviour\n",
"\n",
"    Returns the document-topic matrix produced by fit_transform.\n",
"    \"\"\"\n",
"    lda = LatentDirichletAllocation(n_components=9, learning_decay = 0.7, batch_size = 128,\n",
"                                    max_iter = 50, random_state=random_state)\n",
"    id_topic = lda.fit_transform(data_vectorized)\n",
"    for topic, comp in enumerate(lda.components_):\n",
"        # argsort is ascending, so reverse it before taking the top terms\n",
"        word_idx = np.argsort(comp)[::-1][:n_top_words]\n",
"        words = [vocab[i] for i in word_idx]\n",
"        print('Topic: %d' % topic)\n",
"        print(' %s' % ', '.join(words))\n",
"    # NOTE(review): other cells in this notebook load '020725_README_lda.jl';\n",
"    # confirm whether the differing default file name here is intentional.\n",
"    joblib.dump(lda, model_path)\n",
"    return id_topic"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"id": "80bcdc6c-8a3d-4738-87b5-15e6c2db3a27",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_most_prevalent(vect_documents, documents, model_path='020725_README_lda.jl'):\n",
"    \"\"\"Find, for every topic, the document with the highest weight on it.\n",
"\n",
"    vect_documents -- document-term matrix passed to the saved model's transform\n",
"    documents      -- labels (here: file paths) aligned with the matrix rows\n",
"    model_path     -- joblib file holding the fitted LDA model\n",
"\n",
"    Prints and returns a dict {topic: [weight, document]}. A topic keeps the\n",
"    placeholder [0, \"\"] if no document has a weight above 0 for it; ties are\n",
"    won by the earliest document.\n",
"    \"\"\"\n",
"    # joblib.load unpickles the file: only load model files you trust\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    # take the topic count from the model output instead of hard-coding 9\n",
"    n_topics = distributions.shape[1]\n",
"    most_prevalent = {topic: [0, \"\"] for topic in range(n_topics)}\n",
"    for doc_idx, topic_distribution in enumerate(distributions):\n",
"        for topic in range(n_topics):\n",
"            if topic_distribution[topic] > most_prevalent[topic][0]:\n",
"                most_prevalent[topic] = [topic_distribution[topic], documents[doc_idx]]\n",
"    print(most_prevalent)\n",
"    return most_prevalent\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"id": "3afd27af-8e8f-43c0-8610-06f7a68d5aec",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def prevalent_topics(vect_documents, file_list, model_path='020725_README_lda.jl',\n",
"                     output_csv='020725_README_file_topic_distributions.csv', n_extremes=10):\n",
"    \"\"\"Report per-document topic weights for a saved LDA model.\n",
"\n",
"    vect_documents -- document-term matrix passed to the saved model's transform\n",
"    file_list      -- file paths aligned with the matrix rows\n",
"    model_path     -- joblib file holding the fitted LDA model\n",
"    output_csv     -- CSV written with one row per document: filename, t0..tK\n",
"    n_extremes     -- how many highest / lowest documents to print per topic\n",
"\n",
"    Prints, in order: the number of documents whose top weight is shared by\n",
"    more than one topic; for each topic the n_extremes largest and smallest\n",
"    rows followed by their file paths; and the mean weight of every topic.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    # joblib.load unpickles the file: only load model files you trust\n",
"    lda = joblib.load(model_path)\n",
"    distributions = lda.transform(vect_documents)\n",
"    # take the topic count from the model output instead of hard-coding 9\n",
"    n_topics = distributions.shape[1]\n",
"    #figuring out how many documents have no single dominant topic\n",
"    count_of_multiple = 0\n",
"    for topic_distribution in distributions:\n",
"        if np.count_nonzero(topic_distribution == topic_distribution.max()) > 1:\n",
"            count_of_multiple += 1\n",
"    print(count_of_multiple)\n",
"    df = pd.DataFrame(distributions)\n",
"    #finding the distribution values for all documents\n",
"    fieldnames = ['filename'] + ['t' + str(j) for j in range(n_topics)]\n",
"    with open(output_csv, 'w', newline='') as csvfile:\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"        for i, topic_distribution in enumerate(distributions):\n",
"            # keep only the last path component as the file name\n",
"            record = {'filename': file_list[i].split(\"/\")[-1]}\n",
"            for j in range(n_topics):\n",
"                record['t' + str(j)] = topic_distribution[j]\n",
"            writer.writerow(record)\n",
"    for topic in range(n_topics):\n",
"        print(\"-----------------------Topic \" + str(topic) + \" --------------------------------\")\n",
"        # highest-weighted documents first, then the lowest-weighted ones\n",
"        for extreme_rows in (df.nlargest(n_extremes, topic), df.nsmallest(n_extremes, topic)):\n",
"            print(extreme_rows)\n",
"            for index in extreme_rows.index.to_list():\n",
"                print(file_list[index])\n",
"    averages = df.mean()\n",
"    print(averages)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "5aefbafc-0c1b-409a-afbc-655e4cef91e3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def most_frequent(topic_prevalence, top_n=11):\n",
"    \"\"\"Rank the values in `topic_prevalence` from most to least frequent.\n",
"\n",
"    Repeatedly takes the mode of the remaining values and removes every\n",
"    occurrence of it, for at most `top_n` rounds. The caller's list is not\n",
"    modified. Prints the ranking and also returns it.\n",
"    \"\"\"\n",
"    most_frequent_array = []\n",
"    remaining = list(topic_prevalence)\n",
"    for _ in range(top_n):\n",
"        # mode() raises StatisticsError on empty data, which used to happen\n",
"        # whenever there were fewer than 11 distinct values\n",
"        if not remaining:\n",
"            break\n",
"        topic = mode(remaining)\n",
"        most_frequent_array.append(topic)\n",
"        remaining = [value for value in remaining if value != topic]\n",
"    print(most_frequent_array)\n",
"    return most_frequent_array"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "69d606fd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "1f937c2e-2714-475d-b670-602164c46642",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Mean wordcount: 324.0929957406531\n",
|
|
"Median wordcount: 156.0\n",
|
|
"Mean wordlength: 6.354120246310486\n",
|
|
"Median wordlength: 5.950514528900827\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/SOC.NORTHWESTERN.EDU/nws8519/anaconda3/lib/python3.12/html/parser.py:171: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n",
|
|
" k = self.parse_starttag(i)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"listed_corpus, wordcounts, wordlengths, file_list= get_data_from_dir(readme_directory)\n",
|
|
"print(\"Mean wordcount: \", mean(wordcounts))\n",
|
|
"print(\"Median wordcount: \", median(wordcounts))\n",
|
|
"print(\"Mean wordlength: \", mean(wordlengths))\n",
|
|
"print(\"Median wordlength: \", median(wordlengths))\n",
|
|
"lemmatized_corpus = preprocess(listed_corpus)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e90e236f-8db5-40cc-88a3-60e674b9d1de",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['020725_README_vectorizer.joblib']"
|
|
]
|
|
},
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"'''\n",
|
|
"vectorizer = CountVectorizer(analyzer='word', \n",
|
|
" min_df=2, \n",
|
|
" stop_words='english', \n",
|
|
" lowercase=True, \n",
|
|
" token_pattern='[a-zA-Z0-9]{2,}', \n",
|
|
" )\n",
|
|
"data_vectorized = vectorizer.fit_transform(lemmatized_corpus)\n",
|
|
"joblib.dump(vectorizer, '020725_README_vectorizer.joblib')\n",
|
|
"'''\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "d68aaf7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"vectorizer = joblib.load('020725_README_vectorizer.joblib')\n",
|
|
"data_vectorized = vectorizer.transform(lemmatized_corpus) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "dd1a70c2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
|
|
"[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1158862.039 total time= 17.6s\n",
|
|
"[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1121276.805 total time= 12.0s\n",
|
|
"[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1058330.478 total time= 12.6s\n",
|
|
"[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1169073.807 total time= 12.7s\n",
|
|
"[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1308701.275 total time= 11.9s\n",
|
|
"[CV 1/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 1/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1157991.152 total time= 11.8s\n",
|
|
"[CV 2/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 2/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1120570.803 total time= 11.7s\n",
|
|
"[CV 3/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 3/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1055699.316 total time= 12.4s\n",
|
|
"[CV 4/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 4/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1168297.207 total time= 11.7s\n",
|
|
"[CV 5/5; 2/6] START batch_size=128, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 5/5; 2/6] END batch_size=128, learning_decay=0.7, n_components=9;, score=-1307949.520 total time= 12.4s\n",
|
|
"[CV 1/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 1/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1157830.351 total time= 11.7s\n",
|
|
"[CV 2/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 2/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1124221.589 total time= 11.8s\n",
|
|
"[CV 3/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 3/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1056916.516 total time= 12.0s\n",
|
|
"[CV 4/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 4/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1169168.331 total time= 12.9s\n",
|
|
"[CV 5/5; 3/6] START batch_size=128, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 5/5; 3/6] END batch_size=128, learning_decay=0.9, n_components=9;, score=-1308175.234 total time= 12.5s\n",
|
|
"[CV 1/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 1/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1158540.475 total time= 12.2s\n",
|
|
"[CV 2/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 2/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1120071.919 total time= 11.8s\n",
|
|
"[CV 3/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 3/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1061777.082 total time= 12.3s\n",
|
|
"[CV 4/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 4/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1170380.631 total time= 11.6s\n",
|
|
"[CV 5/5; 4/6] START batch_size=256, learning_decay=0.5, n_components=9..........\n",
|
|
"[CV 5/5; 4/6] END batch_size=256, learning_decay=0.5, n_components=9;, score=-1307034.410 total time= 11.6s\n",
|
|
"[CV 1/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 1/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1156265.357 total time= 13.2s\n",
|
|
"[CV 2/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 2/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1121786.140 total time= 11.8s\n",
|
|
"[CV 3/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 3/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1060299.402 total time= 12.5s\n",
|
|
"[CV 4/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 4/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1174913.458 total time= 12.1s\n",
|
|
"[CV 5/5; 5/6] START batch_size=256, learning_decay=0.7, n_components=9..........\n",
|
|
"[CV 5/5; 5/6] END batch_size=256, learning_decay=0.7, n_components=9;, score=-1305421.859 total time= 11.6s\n",
|
|
"[CV 1/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 1/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1158280.857 total time= 12.0s\n",
|
|
"[CV 2/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 2/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1117369.387 total time= 11.5s\n",
|
|
"[CV 3/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 3/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1058230.568 total time= 12.0s\n",
|
|
"[CV 4/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 4/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1171980.166 total time= 11.9s\n",
|
|
"[CV 5/5; 6/6] START batch_size=256, learning_decay=0.9, n_components=9..........\n",
|
|
"[CV 5/5; 6/6] END batch_size=256, learning_decay=0.9, n_components=9;, score=-1305822.528 total time= 11.3s\n",
|
|
"Best Model's Params: {'batch_size': 128, 'learning_decay': 0.7, 'n_components': 9}\n",
|
|
"Best Log Likelihood Score: -1162101.5996668166\n",
|
|
"Model Perplexity: 2176.5064559983784\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"lda_model_identification(data_vectorized)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "aa83d20f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Topic: 0\n",
|
|
" image, data, key, file, color, option, support, format, default, mode\n",
|
|
"Topic: 1\n",
|
|
" data, test, library, object, implementation, support, packet, used, byte, class\n",
|
|
"Topic: 2\n",
|
|
" license, copyright, perl, gnu, free, version, module, public, general, warranty\n",
|
|
"Topic: 3\n",
|
|
" test, value, function, return, method, class, string, type, object, example\n",
|
|
"Topic: 4\n",
|
|
" http, git, server, install, client, request, test, version, project, command\n",
|
|
"Topic: 5\n",
|
|
" json, node, require, string, parser, var, object, parse, function, font\n",
|
|
"Topic: 6\n",
|
|
" command, output, option, process, make, program, script, tool, file, linux\n",
|
|
"Topic: 7\n",
|
|
" table, html, tag, text, django, xml, example, path, template, default\n",
|
|
"Topic: 8\n",
|
|
" install, make, build, library, version, directory, file, package, window, project\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"id": "f4345bd6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{0: [0.9963399069190733, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'], 1: [0.9987558745140913, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'], 2: [0.999271074201955, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'], 3: [0.9966940236237574, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'], 4: [0.9962628678061417, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'], 5: [0.998166117886522, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'], 6: [0.9670683884278027, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'], 7: [0.9996764637160757, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'], 8: [0.9976094391943828, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{0: [0.9963399069190733,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'],\n",
|
|
" 1: [0.9987558745140913,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'],\n",
|
|
" 2: [0.999271074201955,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'],\n",
|
|
" 3: [0.9966940236237574,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'],\n",
|
|
" 4: [0.9962628678061417,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'],\n",
|
|
" 5: [0.998166117886522,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'],\n",
|
|
" 6: [0.9670683884278027,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'],\n",
|
|
" 7: [0.9996764637160757,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md'],\n",
|
|
" 8: [0.9976094391943828,\n",
|
|
" '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX']}"
|
|
]
|
|
},
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"get_most_prevalent(data_vectorized, file_list)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"id": "23468e82",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"183\n",
|
|
"-----------------------Topic 0 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3142 0.996340 0.000457 0.000458 0.000458 0.000458 0.000457 0.000457 \n",
|
|
"810 0.995085 0.000614 0.000614 0.000614 0.000614 0.000614 0.000614 \n",
|
|
"3064 0.983533 0.002058 0.002058 0.002059 0.002059 0.002058 0.002059 \n",
|
|
"2980 0.960597 0.000512 0.035817 0.000512 0.000513 0.000512 0.000512 \n",
|
|
"197 0.892184 0.000950 0.000951 0.000950 0.000950 0.101164 0.000950 \n",
|
|
"131 0.867562 0.001765 0.001764 0.036755 0.001764 0.001765 0.001765 \n",
|
|
"3694 0.864345 0.001390 0.001390 0.001389 0.001390 0.049473 0.001390 \n",
|
|
"582 0.857786 0.000105 0.064223 0.043669 0.000105 0.000105 0.000105 \n",
|
|
"3026 0.851801 0.018529 0.018519 0.018536 0.018522 0.018532 0.018522 \n",
|
|
"1647 0.851778 0.018530 0.018519 0.018530 0.018528 0.018532 0.018540 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3142 0.000458 0.000458 \n",
|
|
"810 0.000614 0.000615 \n",
|
|
"3064 0.002059 0.002058 \n",
|
|
"2980 0.000512 0.000512 \n",
|
|
"197 0.000950 0.000950 \n",
|
|
"131 0.085096 0.001765 \n",
|
|
"3694 0.001389 0.077843 \n",
|
|
"582 0.033796 0.000105 \n",
|
|
"3026 0.018521 0.018519 \n",
|
|
"1647 0.018521 0.018523 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2362_plasma_breeze-plymouth.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1356_katomic.git_README.levels\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/541_dunst-project_dunst_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2497_borntyping_python-colorlog.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2421_whipper-team_whipper.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3297_takaswie_hinawa-utils.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/697_tiwai_awesfx.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2831_kilobyte_pmemkv_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2095_warner_magic-wormhole.git_README.md\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"37 0.000045 0.164319 0.053585 0.504039 0.040306 0.006119 0.171102 \n",
|
|
"2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"3287 0.000070 0.000070 0.007213 0.190779 0.000070 0.028435 0.027117 \n",
|
|
"455 0.000071 0.481662 0.000071 0.030261 0.000071 0.000071 0.336087 \n",
|
|
"1313 0.000076 0.047031 0.000076 0.097269 0.012519 0.034803 0.056043 \n",
|
|
"1598 0.000079 0.000079 0.000079 0.542049 0.000079 0.000079 0.000079 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"37 0.005875 0.054610 \n",
|
|
"2913 0.000048 0.000048 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"3287 0.746179 0.000070 \n",
|
|
"455 0.000071 0.151635 \n",
|
|
"1313 0.104692 0.647491 \n",
|
|
"1598 0.000079 0.457399 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/722_zeromq_czmq.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2774_timlegge_perl-XML-Generator_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3509_jirka-h_haveged_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/511_mmottl_ocaml-makefile.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4029_yayahjb_cqrlib.git_README_CQRlib.html\n",
|
|
"-----------------------Topic 1 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"328 0.000156 0.998756 0.000155 0.000156 0.000156 0.000156 0.000156 \n",
|
|
"2081 0.000601 0.995192 0.000601 0.000601 0.000601 0.000601 0.000601 \n",
|
|
"2672 0.000862 0.993105 0.000862 0.000862 0.000862 0.000862 0.000862 \n",
|
|
"389 0.004276 0.965797 0.004276 0.004275 0.004274 0.004275 0.004277 \n",
|
|
"3087 0.005558 0.955534 0.005557 0.005556 0.005560 0.005558 0.005559 \n",
|
|
"2478 0.006538 0.947694 0.006537 0.006539 0.006538 0.006538 0.006538 \n",
|
|
"2272 0.006955 0.944406 0.006946 0.006946 0.006950 0.006947 0.006950 \n",
|
|
"853 0.000350 0.932979 0.000350 0.000350 0.000350 0.000350 0.000350 \n",
|
|
"1081 0.000567 0.931828 0.000567 0.064202 0.000567 0.000567 0.000567 \n",
|
|
"1443 0.008555 0.931588 0.008548 0.008556 0.008549 0.008553 0.008550 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"328 0.000156 0.000155 \n",
|
|
"2081 0.000601 0.000601 \n",
|
|
"2672 0.000862 0.000862 \n",
|
|
"389 0.004275 0.004275 \n",
|
|
"3087 0.005558 0.005560 \n",
|
|
"2478 0.006537 0.006539 \n",
|
|
"2272 0.006951 0.006949 \n",
|
|
"853 0.000350 0.064574 \n",
|
|
"1081 0.000567 0.000567 \n",
|
|
"1443 0.008550 0.008551 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2517_cleder_fastkml_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2967_mila-iqia_picklable-itertools_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2575_samtools_htsjdk.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/543_boto_s3transfer_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2308_php-fig_cache.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2772_storaged-project_libblockdev_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/304_pauldmccarthy_indexed_gzip.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1282_python-lz4_python-lz4.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2289_carlodefalco_octave-mpi_README.md\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"3072 0.582368 0.000065 0.019226 0.039125 0.000065 0.041239 0.209392 \n",
|
|
"661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n",
|
|
"3221 0.221286 0.000069 0.040534 0.620289 0.000069 0.000069 0.000069 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3795 0.131447 0.000015 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3027 0.717508 0.000039 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"964 0.000043 0.789164 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"3072 0.069255 0.039265 \n",
|
|
"661 0.000066 0.000066 \n",
|
|
"3221 0.117548 0.000069 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2056_OpenPrinting_foomatic-db.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4206_dnmfarrell_Data-FormValidator.git_README\n",
|
|
"-----------------------Topic 2 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"672 0.000091 0.000091 0.999271 0.000091 0.000091 0.000091 0.000091 \n",
|
|
"3181 0.000152 0.000152 0.998782 0.000152 0.000152 0.000152 0.000152 \n",
|
|
"3929 0.000966 0.000967 0.992265 0.000967 0.000967 0.000967 0.000967 \n",
|
|
"1475 0.001039 0.001039 0.991690 0.001039 0.001039 0.001039 0.001039 \n",
|
|
"1309 0.001059 0.001059 0.991530 0.001059 0.001059 0.001059 0.001059 \n",
|
|
"4022 0.001079 0.001079 0.991367 0.001079 0.001079 0.001079 0.001079 \n",
|
|
"3212 0.001112 0.001112 0.991106 0.001112 0.001112 0.001112 0.001112 \n",
|
|
"2521 0.001123 0.001124 0.991016 0.001123 0.001123 0.001123 0.001123 \n",
|
|
"2024 0.001134 0.001134 0.990927 0.001134 0.001135 0.001134 0.001134 \n",
|
|
"2147 0.001146 0.001146 0.990834 0.001146 0.001146 0.001146 0.001146 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"672 0.000091 0.000091 \n",
|
|
"3181 0.000152 0.000152 \n",
|
|
"3929 0.000967 0.000967 \n",
|
|
"1475 0.001039 0.001039 \n",
|
|
"1309 0.001059 0.001059 \n",
|
|
"4022 0.001079 0.001079 \n",
|
|
"3212 0.001112 0.001112 \n",
|
|
"2521 0.001123 0.001123 \n",
|
|
"2024 0.001134 0.001134 \n",
|
|
"2147 0.001146 0.001146 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/666_RobertBeckebans_RBDOOM-3-BFG_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1439_pdfminer_pdfminer.six.git_README.html\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1191_backuppc_backuppc-xs.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3536_jkeenan_extutils-modulemaker.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/898_adrianlopezroche_fdupes.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/767_knik0_faad2.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3331_ClusterLabs_fence-agents_README.licence\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/139_seattlerb_ruby_parser.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2854_faye_faye_README.txt\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n",
|
|
"1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"10 0.000851 0.259910 0.000064 0.008255 0.000064 0.000845 0.723237 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3795 0.131447 0.000015 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3648 0.000038 0.156749 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"2913 0.000048 0.000048 \n",
|
|
"1537 0.000049 0.391008 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"10 0.006711 0.000064 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n",
|
|
"-----------------------Topic 3 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"4184 0.000413 0.000413 0.000413 0.996694 0.000413 0.000413 0.000413 \n",
|
|
"634 0.000585 0.000585 0.000585 0.995319 0.000585 0.000585 0.000585 \n",
|
|
"541 0.001059 0.001059 0.001059 0.991531 0.001058 0.001059 0.001059 \n",
|
|
"4086 0.001069 0.001069 0.001069 0.991448 0.001069 0.001069 0.001069 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"1082 0.000108 0.000108 0.038632 0.960609 0.000108 0.000108 0.000108 \n",
|
|
"3436 0.000214 0.000214 0.000214 0.953378 0.000214 0.045126 0.000214 \n",
|
|
"3099 0.000483 0.000483 0.000483 0.945309 0.030232 0.000483 0.000483 \n",
|
|
"168 0.000204 0.000204 0.016534 0.944392 0.037848 0.000204 0.000204 \n",
|
|
"805 0.000420 0.000419 0.000420 0.939900 0.057163 0.000420 0.000419 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"4184 0.000413 0.000413 \n",
|
|
"634 0.000585 0.000585 \n",
|
|
"541 0.001058 0.001059 \n",
|
|
"4086 0.001069 0.001069 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"1082 0.000108 0.000108 \n",
|
|
"3436 0.000214 0.000214 \n",
|
|
"3099 0.000483 0.021559 \n",
|
|
"168 0.000204 0.000204 \n",
|
|
"805 0.000419 0.000420 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3251_eproxus_meck.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1044_wolever_parameterized_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2204_easystats_parameters.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4060_perl5-utils_Params-Util_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/629_hamcrest_PyHamcrest_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4064_testing-cabal_mock_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3817_webmozart_assert_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1432_fluxx_exam_README.rst\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3355 0.044710 0.035340 0.279984 0.000040 0.000040 0.019913 0.122916 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"3870 0.312524 0.017558 0.065654 0.000051 0.032472 0.000051 0.098971 \n",
|
|
"3366 0.196719 0.041714 0.043868 0.000061 0.309832 0.000061 0.306935 \n",
|
|
"1239 0.133374 0.866158 0.000067 0.000067 0.000067 0.000067 0.000067 \n",
|
|
"2205 0.306048 0.030212 0.382652 0.000073 0.018162 0.071704 0.032824 \n",
|
|
"1183 0.281059 0.000077 0.000077 0.000077 0.000077 0.007434 0.000077 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3355 0.333160 0.163897 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"964 0.000043 0.789164 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"3870 0.455920 0.016799 \n",
|
|
"3366 0.000061 0.100750 \n",
|
|
"1239 0.000067 0.000067 \n",
|
|
"2205 0.045686 0.112640 \n",
|
|
"1183 0.000077 0.711047 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1611_arno-iptables-firewall_aif.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2225_JamesHeinrich_getID3_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/362_mate-desktop_marco.git_README\n",
|
|
"-----------------------Topic 4 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"331 0.000467 0.000467 0.000467 0.000467 0.996263 0.000467 0.000467 \n",
|
|
"3569 0.000601 0.000601 0.000601 0.000601 0.995193 0.000601 0.000601 \n",
|
|
"2222 0.000717 0.000717 0.000717 0.000717 0.994262 0.000717 0.000717 \n",
|
|
"3364 0.000761 0.000761 0.000762 0.000762 0.993908 0.000761 0.000761 \n",
|
|
"659 0.000806 0.000805 0.000805 0.000807 0.993555 0.000806 0.000805 \n",
|
|
"4125 0.000849 0.000849 0.000848 0.000849 0.993211 0.000849 0.000849 \n",
|
|
"3191 0.000975 0.000976 0.000975 0.000975 0.992198 0.000975 0.000975 \n",
|
|
"1342 0.000992 0.000992 0.000993 0.000992 0.992061 0.000992 0.000992 \n",
|
|
"3762 0.001090 0.001090 0.001089 0.001089 0.991284 0.001089 0.001089 \n",
|
|
"1350 0.001135 0.001135 0.001134 0.001135 0.990924 0.001135 0.001135 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"331 0.000467 0.000467 \n",
|
|
"3569 0.000601 0.000601 \n",
|
|
"2222 0.000717 0.000717 \n",
|
|
"3364 0.000762 0.000761 \n",
|
|
"659 0.000806 0.000806 \n",
|
|
"4125 0.000849 0.000848 \n",
|
|
"3191 0.000975 0.000975 \n",
|
|
"1342 0.000992 0.000992 \n",
|
|
"3762 0.001089 0.001090 \n",
|
|
"1350 0.001134 0.001135 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3145_tkem_mopidy-dleyna.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2600_spyder-ide_qtpy_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/953_rroemhild_flask-ldapconn_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3185_mopidy_mopidy-scrobbler.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2310_tduehr_omniauth-cas3_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2822_gawel_panoramisk.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3648_tkem_mopidy-podcast-itunes_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3969_kingosticks_mopidy-tunein_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3103_erdewit_nest_asyncio_README.rst\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n",
|
|
"3355 0.044710 0.035340 0.279984 0.000040 0.000040 0.019913 0.122916 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3795 0.131447 0.000015 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3027 0.717508 0.000039 \n",
|
|
"3355 0.333160 0.163897 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"964 0.000043 0.789164 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/934_AlDanial_cloc.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"-----------------------Topic 5 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3370 0.000229 0.000229 0.000229 0.000229 0.000229 0.998166 0.000229 \n",
|
|
"1017 0.000333 0.000333 0.000333 0.000333 0.000333 0.997338 0.000333 \n",
|
|
"3174 0.000423 0.000423 0.000423 0.000423 0.000423 0.996619 0.000423 \n",
|
|
"717 0.001710 0.001711 0.001710 0.001710 0.001711 0.986318 0.001710 \n",
|
|
"4132 0.002268 0.002269 0.002268 0.002269 0.002269 0.981849 0.002269 \n",
|
|
"3 0.002778 0.002779 0.002779 0.002780 0.002779 0.977767 0.002779 \n",
|
|
"1555 0.002925 0.002925 0.002924 0.002925 0.002925 0.976601 0.002926 \n",
|
|
"1336 0.003969 0.003970 0.003969 0.003972 0.003970 0.968244 0.003969 \n",
|
|
"705 0.005565 0.005560 0.005561 0.005569 0.005559 0.955511 0.005559 \n",
|
|
"2220 0.006536 0.006537 0.006541 0.006540 0.006537 0.947699 0.006536 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3370 0.000229 0.000229 \n",
|
|
"1017 0.000333 0.000333 \n",
|
|
"3174 0.000423 0.000423 \n",
|
|
"717 0.001710 0.001710 \n",
|
|
"4132 0.002269 0.002269 \n",
|
|
"3 0.002779 0.002780 \n",
|
|
"1555 0.002925 0.002925 \n",
|
|
"1336 0.003969 0.003969 \n",
|
|
"705 0.005558 0.005557 \n",
|
|
"2220 0.006537 0.006536 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2594_crsmithdev_arrow.git_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2879_jonschlinkert_map-visit_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1625_keis_base58_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/162_sebastianbergmann_comparator_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1075_sebastianbergmann_object-enumerator_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/830_npm_abbrev-js_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1722_unclechu_node-deep-extend.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2412_crosswire-bible-society_nave_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4251_hughsk_is-typedarray_README.md\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n",
|
|
"3870 0.312524 0.017558 0.065654 0.000051 0.032472 0.000051 0.098971 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3795 0.131447 0.000015 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3648 0.000038 0.156749 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"1537 0.000049 0.391008 \n",
|
|
"3870 0.455920 0.016799 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1926_darold_ora2pg.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"-----------------------Topic 6 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"1893 0.004116 0.004116 0.004116 0.004116 0.004119 0.004116 0.967068 \n",
|
|
"2605 0.000456 0.000456 0.000456 0.000456 0.000456 0.031675 0.965135 \n",
|
|
"2308 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n",
|
|
"2446 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n",
|
|
"4152 0.004630 0.004630 0.004630 0.004630 0.004634 0.004630 0.962953 \n",
|
|
"371 0.005053 0.005053 0.005053 0.005051 0.005053 0.005052 0.959573 \n",
|
|
"1589 0.006946 0.006946 0.006947 0.006945 0.006947 0.006945 0.944423 \n",
|
|
"3111 0.006950 0.006946 0.006947 0.006950 0.006950 0.006950 0.944408 \n",
|
|
"3995 0.056876 0.000100 0.000100 0.000100 0.000100 0.000100 0.942423 \n",
|
|
"3135 0.007414 0.007415 0.007409 0.007414 0.007411 0.007411 0.940706 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"1893 0.004116 0.004117 \n",
|
|
"2605 0.000456 0.000456 \n",
|
|
"2308 0.004630 0.004631 \n",
|
|
"2446 0.004630 0.004631 \n",
|
|
"4152 0.004630 0.004631 \n",
|
|
"371 0.005052 0.005059 \n",
|
|
"1589 0.006949 0.006953 \n",
|
|
"3111 0.006952 0.006948 \n",
|
|
"3995 0.000100 0.000100 \n",
|
|
"3135 0.007410 0.007411 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1607_ppentchev_feature-check_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2577_kodi-pvr_pvr.dvbviewer.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2363_kodi-pvr_pvr.hdhomerun.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3414_kodi-pvr_pvr.njoy.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3527_hackerschoice_THC-Archive_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2455_tcolar_wmfrog_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2103_aperezdc_signify.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/119_linux-thinkpad_tp_smapi_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3587_igaw_jitterdebugger.git_README\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3597 0.520753 0.030707 0.076996 0.009554 0.064448 0.004423 0.000021 \n",
|
|
"3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n",
|
|
"3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"4047 0.672539 0.165141 0.000064 0.028149 0.043231 0.000064 0.000064 \n",
|
|
"661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n",
|
|
"1239 0.133374 0.866158 0.000067 0.000067 0.000067 0.000067 0.000067 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3597 0.000021 0.293076 \n",
|
|
"3648 0.000038 0.156749 \n",
|
|
"3027 0.717508 0.000039 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"4047 0.090683 0.000064 \n",
|
|
"661 0.000066 0.000066 \n",
|
|
"1239 0.000067 0.000067 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/554_memtest86plus_memtest86plus.git_README.md\n",
|
|
"-----------------------Topic 7 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"2445 0.000363 0.000363 0.000363 0.000363 0.000363 0.000363 0.000363 \n",
|
|
"1971 0.001001 0.001002 0.001001 0.001002 0.001001 0.001002 0.001001 \n",
|
|
"2288 0.001635 0.001635 0.001634 0.001635 0.001635 0.001636 0.001635 \n",
|
|
"52 0.002138 0.002139 0.002138 0.002140 0.002138 0.002138 0.002138 \n",
|
|
"2276 0.023188 0.000155 0.000155 0.000155 0.000155 0.000155 0.000155 \n",
|
|
"1682 0.000772 0.000772 0.000772 0.000772 0.039993 0.000772 0.000772 \n",
|
|
"3082 0.009264 0.009261 0.009262 0.009265 0.009266 0.009262 0.009269 \n",
|
|
"1117 0.084798 0.000643 0.000642 0.000643 0.000643 0.000642 0.000643 \n",
|
|
"2480 0.000185 0.000185 0.000185 0.000185 0.000185 0.000185 0.000185 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"2445 0.997094 0.000363 \n",
|
|
"1971 0.991989 0.001001 \n",
|
|
"2288 0.986920 0.001635 \n",
|
|
"52 0.982894 0.002138 \n",
|
|
"2276 0.975728 0.000155 \n",
|
|
"1682 0.954604 0.000772 \n",
|
|
"3082 0.925882 0.009269 \n",
|
|
"1117 0.910704 0.000642 \n",
|
|
"2480 0.909884 0.088824 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3104_scrapy-plugins_scrapy-djangoitem_README.rst\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1445_carljm_django-model-utils.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/864_bfirsh_django-ordered-model.git_README.markdown\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3002_ionelmc_python-darkslide.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4196_alexott_muse_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/995_coleifer_wtf-peewee_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/811_sopel-irc_sopel.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1019_jazzband_django-sortedm2m.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/356_wanderlust_apel_README.en\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3597 0.520753 0.030707 0.076996 0.009554 0.064448 0.004423 0.000021 \n",
|
|
"3966 0.029730 0.000035 0.000035 0.000035 0.000035 0.000035 0.674307 \n",
|
|
"3648 0.540425 0.023001 0.000038 0.245106 0.034566 0.000038 0.000038 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"964 0.089514 0.000043 0.006234 0.000043 0.000043 0.071786 0.043128 \n",
|
|
"2865 0.508832 0.000047 0.000047 0.000047 0.000047 0.000047 0.038802 \n",
|
|
"2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n",
|
|
"1537 0.313891 0.089990 0.000049 0.023406 0.134488 0.000049 0.047071 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3597 0.000021 0.293076 \n",
|
|
"3966 0.000035 0.295755 \n",
|
|
"3648 0.000038 0.156749 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"964 0.000043 0.789164 \n",
|
|
"2865 0.000047 0.452085 \n",
|
|
"2913 0.000048 0.000048 \n",
|
|
"1537 0.000049 0.391008 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3624_audacity_audacity.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3572_Xastir_Xastir.git_README.1ST\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1212_smbolton_whysynth.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/963_OpenTTD_OpenTTD.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1620_kilobyte_qjoypad_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3471_iortcw_iortcw.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"-----------------------Topic 8 --------------------------------\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"140 0.000299 0.000299 0.000299 0.000299 0.000299 0.000299 0.000299 \n",
|
|
"444 0.000475 0.000475 0.000475 0.000475 0.000475 0.000475 0.000475 \n",
|
|
"124 0.000517 0.000517 0.000517 0.000517 0.000517 0.000517 0.000517 \n",
|
|
"2319 0.000559 0.000559 0.000559 0.000559 0.000559 0.000559 0.000559 \n",
|
|
"3911 0.000564 0.000564 0.000564 0.000564 0.000564 0.000564 0.000564 \n",
|
|
"2132 0.000621 0.000621 0.000621 0.000621 0.000621 0.000621 0.000621 \n",
|
|
"75 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 \n",
|
|
"529 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 0.000635 \n",
|
|
"2202 0.000642 0.000642 0.000642 0.000643 0.000642 0.000643 0.000643 \n",
|
|
"3084 0.000642 0.000642 0.000642 0.000643 0.000642 0.000643 0.000643 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"140 0.000299 0.997609 \n",
|
|
"444 0.000475 0.996200 \n",
|
|
"124 0.000517 0.995864 \n",
|
|
"2319 0.000559 0.995531 \n",
|
|
"3911 0.000565 0.995485 \n",
|
|
"2132 0.000621 0.995032 \n",
|
|
"75 0.000635 0.994919 \n",
|
|
"529 0.000635 0.994919 \n",
|
|
"2202 0.000643 0.994860 \n",
|
|
"3084 0.000643 0.994860 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3815_swami_swami_README.OSX\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3202_extensions_dune-grid-glue_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2977_airspy_airspyone_host.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2136_wxMaxima-developers_wxmaxima_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2068_math-comp_math-comp_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2814_MaartenBaert_ssr_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2954_core_dune-localfunctions_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1140_core_dune-geometry_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/305_staging_dune-typetree_README.GIT\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/4205_staging_dune-functions_README\n",
|
|
" 0 1 2 3 4 5 6 \\\n",
|
|
"3795 0.002233 0.000015 0.000015 0.248725 0.000015 0.000015 0.617519 \n",
|
|
"3027 0.042031 0.000039 0.110408 0.114454 0.000039 0.015444 0.000039 \n",
|
|
"3251 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 0.000040 \n",
|
|
"337 0.000043 0.000042 0.033554 0.966149 0.000042 0.000042 0.000043 \n",
|
|
"2913 0.000048 0.195954 0.000048 0.019170 0.723177 0.003377 0.058128 \n",
|
|
"1149 0.000052 0.650270 0.000052 0.349365 0.000052 0.000052 0.000052 \n",
|
|
"802 0.000055 0.850160 0.000055 0.149458 0.000055 0.000055 0.000055 \n",
|
|
"10 0.000851 0.259910 0.000064 0.008255 0.000064 0.000845 0.723237 \n",
|
|
"4047 0.672539 0.165141 0.000064 0.028149 0.043231 0.000064 0.000064 \n",
|
|
"661 0.300668 0.000066 0.010081 0.616868 0.000066 0.072053 0.000066 \n",
|
|
"\n",
|
|
" 7 8 \n",
|
|
"3795 0.131447 0.000015 \n",
|
|
"3027 0.717508 0.000039 \n",
|
|
"3251 0.999676 0.000040 \n",
|
|
"337 0.000043 0.000043 \n",
|
|
"2913 0.000048 0.000048 \n",
|
|
"1149 0.000052 0.000052 \n",
|
|
"802 0.000055 0.000055 \n",
|
|
"10 0.006711 0.000064 \n",
|
|
"4047 0.090683 0.000064 \n",
|
|
"661 0.000066 0.000066 \n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2616_bluca_gsl_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3140_resurrecting-open-source-projects_txt2html_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3148_enova_pg_fact_loader.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/800_jazzband_inflect.git_README.txt\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3159_epam_nfstrace_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2080_mrabarnett_mrab-regex_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3900_EsotericSoftware_kryo.git_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3091_AFLplusplus_AFLplusplus.git_README\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2523_jeancroy_fuzzaldrin-plus_README.md\n",
|
|
"/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1675_ronsavage_GraphViz.git_README\n",
|
|
"0 0.073153\n",
|
|
"1 0.097928\n",
|
|
"2 0.096995\n",
|
|
"3 0.076255\n",
|
|
"4 0.141707\n",
|
|
"5 0.072689\n",
|
|
"6 0.093970\n",
|
|
"7 0.072439\n",
|
|
"8 0.274864\n",
|
|
"dtype: float64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"prevalent_topics(data_vectorized, file_list)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|