In [1]:
import re
import numpy as np
import pandas as pd
import glob
import copy
import csv
from statistics import mean, median
from strip_markdown import strip_markdown
import joblib

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from statistics import mode

from collections import defaultdict

In [3]:
#import nltk
#nltk.download('wordnet')

In [4]:
# spacy and nltk for lemmatization
import nltk 
#nltk.download('stopwords')
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Rebinds the name `stopwords`: before this line it is the object imported from
# nltk.corpus, afterwards it is the list returned by .words('english').
# preprocess() relies on this module-level list.
stopwords = stopwords.words('english')

In [15]:
def metadata_for_file(file):
    """Compute whitespace-token statistics for one document.

    Args:
        file: The document's text as a string (the content, not a path).

    Returns:
        A ``(word_count, avg_word_length, word_list)`` tuple. ``word_list`` is
        ``file.split()``, ``word_count`` is its length, and ``avg_word_length``
        is the mean token length in characters (``0`` when there are no tokens).
    """
    word_list = file.split()
    word_count = len(word_list)
    # Guard the division: empty or whitespace-only text yields no tokens.
    if not word_count:
        return word_count, 0, word_list
    total_chars = sum(len(word) for word in word_list)
    return word_count, total_chars / word_count, word_list

In [17]:
def get_data_from_dir(directory):
    """Read every path matched by ``<directory>/*`` and collect per-file text statistics.

    Args:
        directory: Directory path; it is interpolated into the glob pattern
            ``f"{directory}/*"``.

    Returns:
        A 4-tuple of parallel lists with one entry per matched path, in the
        order glob returns them:
        ``(data_list, word_counts, avg_word_lengths, file_list)`` where
        ``data_list`` holds each file's text, ``word_counts`` and
        ``avg_word_lengths`` hold the values from ``metadata_for_file``, and
        ``file_list`` holds the matched paths.
    """
    data_list = []
    word_counts = []
    avg_word_lengths = []
    file_list = []
    for file in glob.glob(f"{directory}/*"):
        # The context manager closes each handle as soon as it is read, so a
        # large directory does not accumulate open file descriptors.
        # errors='ignore' silently drops bytes that are not valid UTF-8.
        with open(file, encoding='utf-8', errors='ignore') as handle:
            text = handle.read()
        # Descriptive statistics; the token list itself is not needed here.
        word_count, avg_word_length, _ = metadata_for_file(text)
        word_counts.append(word_count)
        avg_word_lengths.append(avg_word_length)
        # Empty files are kept too, so all four lists stay index-aligned.
        data_list.append(text)
        file_list.append(file)
    return data_list, word_counts, avg_word_lengths, file_list

In [7]:
#preprocessing text data
def preprocess(corpus_list):
    """Clean raw documents into space-joined strings of lemmatized tokens.

    For each document this strips markdown (``strip_markdown``), removes
    HTML-like tags, tokenizes with gensim's ``simple_preprocess``, drops stop
    words and tokens of length <= 2, and lemmatizes with ``WordNetLemmatizer``.

    Args:
        corpus_list: Iterable of raw document strings.

    Returns:
        A list with one string per input document, in input order; each string
        is the surviving lemmas joined by single spaces.
    """
    # Corpus-specific terms removed on top of the English stop word list.
    specific_stopwords = ["http", "com", "www", "org", "file", "code", "time", "software", "use", "user", "set", "line", "run", "source", "github",
    "lineno", "python", "php", "ruby", "api"]
    # Build a per-call set instead of extending the module-level `stopwords`
    # list: extending it made that list grow with duplicates on every call
    # (i.e. every re-run of the cell). A set also makes the membership test
    # below constant-time.
    stopword_set = set(stopwords).union(specific_stopwords)
    lemmatizer = WordNetLemmatizer()
    D_lemma = []
    for doc in corpus_list:
        #stripping markdown from documents
        text = strip_markdown(doc)
        #strip html
        text = re.sub(r'<[^<]+?>', '', text, flags=re.DOTALL)
        #mvp right now, can certainly be expanded as iterations of text analysis are done
        tokens = [token for token in simple_preprocess(text)
                  if token not in stopword_set and len(token) > 2]
        D_lemma.append(" ".join(lemmatizer.lemmatize(token) for token in tokens))
    return D_lemma

In [8]:
#preparing processed data for model usage
def text_preparation(lemmatized_text):
    """Build a gensim dictionary and bag-of-words corpus, with detected bigrams added.

    Args:
        lemmatized_text: List of documents, each a list of string tokens.
            NOTE(review): ``preprocess`` in this file returns space-joined
            strings rather than token lists — confirm that callers tokenize
            before passing documents here.

    Returns:
        ``(bag_of_words, id2word)``: ``id2word`` is the ``corpora.Dictionary``
        built from the bigram-augmented documents after ``filter_extremes``,
        and ``bag_of_words`` is the ``doc2bow`` encoding of each document.
    """
    #bigrams
    bigram = Phrases(lemmatized_text, min_count=2)
    # Build new token lists rather than appending to the caller's lists: the
    # previous shallow copy shared the inner lists, so the input documents were
    # mutated and a second call would append the bigrams again.
    D_bigrams = []
    for doc in lemmatized_text:
        # Tokens containing '_' in the phrase-transformed document are added
        # on top of the original tokens.
        phrase_tokens = [token for token in bigram[doc] if '_' in token]
        D_bigrams.append(doc + phrase_tokens)
    #id2word
    id2word = corpora.Dictionary(D_bigrams)
    id2word.filter_extremes(no_below=5, no_above=0.5)
    #bow representation
    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
    return bag_of_words, id2word

In [9]:
def lda_model_identification(data_vectorized):
    """Grid-search LDA hyperparameters and print a summary of the best candidate.

    The grid fixes ``n_components`` at 9 and varies ``learning_decay`` over
    {0.5, 0.7, 0.9} and ``batch_size`` over {128, 256}. Nothing is returned;
    the best parameters, best cross-validated score and the best estimator's
    perplexity on ``data_vectorized`` are printed.

    Args:
        data_vectorized: Document-term matrix to fit on.
    """
    param_grid = {
        'n_components': [9],
        'learning_decay': [.5, .7, .9],
        'batch_size': [128, 256],
    }
    search = GridSearchCV(LatentDirichletAllocation(), param_grid=param_grid, verbose=10)
    search.fit(data_vectorized)
    winner = search.best_estimator_
    print("Best Model's Params: ", search.best_params_)
    print("Best Log Likelihood Score: ", search.best_score_)
    # Perplexity is computed on the same data the search was fitted on.
    print("Model Perplexity: ", winner.perplexity(data_vectorized))

In [22]:
def best_lda_model(data_vectorized, vocab, n_components=9, n_top_words=10,
                   model_path='020325_README_lda.jl', random_state=None):
    """Fit the chosen LDA configuration, print each topic's top words, and save the model.

    Args:
        data_vectorized: Document-term matrix to fit on.
        vocab: Sequence mapping a feature (column) index to its term.
        n_components: Number of topics to fit. Defaults to 9.
        n_top_words: Number of highest-weighted terms printed per topic.
            Defaults to 10.
        model_path: File the fitted model is written to with ``joblib.dump``.
            NOTE(review): the default differs from '020725_README_lda.jl',
            which ``get_most_prevalent`` and ``prevalent_topics`` load —
            confirm which file is intended before relying on those functions
            seeing this fit.
        random_state: Passed through to ``LatentDirichletAllocation``; set an
            integer to make the fit repeatable. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        The document-topic matrix returned by ``fit_transform``.
    """
    lda = LatentDirichletAllocation(n_components=n_components, learning_decay=0.7,
                                    batch_size=128, max_iter=50,
                                    random_state=random_state)
    id_topic = lda.fit_transform(data_vectorized)
    for topic, comp in enumerate(lda.components_):
        # argsort is ascending; reverse it to take the largest weights first.
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        print('Topic: %d' % topic)
        print('  %s' % ', '.join(vocab[i] for i in word_idx))
    joblib.dump(lda, model_path)
    return id_topic

In [31]:
def get_most_prevalent(vect_documents, documents):
    """Find, for each of the 9 topics, the document with the largest weight on it.

    Loads the model stored at '020725_README_lda.jl', transforms
    ``vect_documents`` into per-document topic weights, and keeps the highest
    weight seen per topic together with the matching entry of ``documents``.

    Args:
        vect_documents: Vectorized documents to pass to the model's ``transform``.
        documents: Sequence indexed in step with the rows of ``vect_documents``
            (e.g. file paths); the entry for the winning row is recorded.

    Returns:
        Dict mapping topic index 0..8 to ``[weight, document]``. A topic for
        which no row has a weight above 0 keeps the placeholder ``[0, ""]``.
        The dict is also printed.
    """
    model = joblib.load('020725_README_lda.jl')
    doc_topic = model.transform(vect_documents)
    most_prevalent = {topic: [0, ""] for topic in range(9)}
    for doc_idx, weights in enumerate(doc_topic):
        for topic in range(9):
            weight = weights[topic]
            # Strict comparison: on a tie the earliest document is kept.
            if weight > most_prevalent[topic][0]:
                most_prevalent[topic] = [weight, documents[doc_idx]]
    print(most_prevalent)
    return most_prevalent


In [35]:
def prevalent_topics(vect_documents, file_list, model_path='020725_README_lda.jl',
                     output_csv='020725_README_file_topic_distributions.csv',
                     n_extremes=10):
    """Write per-document topic weights to CSV and print per-topic extremes.

    Steps, in order:
      1. Load the model from ``model_path`` and transform ``vect_documents``.
      2. Print how many documents do not have exactly one maximal topic.
      3. Write one CSV row per document: the file's base name followed by one
         ``t<j>`` column per topic.
      4. For every topic, print the ``n_extremes`` highest- and lowest-weighted
         documents (the weight rows, then the matching ``file_list`` entries).
      5. Print the mean weight of every topic.

    Args:
        vect_documents: Vectorized documents to pass to the model's ``transform``.
        file_list: Paths indexed in step with the rows of ``vect_documents``.
        model_path: joblib file holding the fitted model.
        output_csv: Destination of the per-document topic weights; overwritten
            if it exists.
        n_extremes: How many documents to list at each end, per topic.

    Returns:
        None. Results are printed and written to ``output_csv``.
    """
    # NOTE(review): joblib.load unpickles the file; only load model files you trust.
    lda = joblib.load(model_path)
    distributions = np.asarray(lda.transform(vect_documents))
    n_topics = distributions.shape[1]

    # A document is counted when its maximum weight is not attained by exactly
    # one topic (e.g. a uniform distribution, where all topics tie).
    max_per_doc = distributions.max(axis=1, keepdims=True)
    n_at_max = (distributions == max_per_doc).sum(axis=1)
    count_of_multiple = int((n_at_max != 1).sum())
    print(count_of_multiple)

    df = pd.DataFrame(distributions)
    #finding the distribution values for all documents
    topic_fields = ['t' + str(j) for j in range(n_topics)]
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['filename'] + topic_fields)
        writer.writeheader()
        for i, weights in enumerate(distributions):
            # Keep only the last path component as the file identifier.
            record = {'filename': file_list[i].split("/")[-1]}
            record.update(zip(topic_fields, weights))
            writer.writerow(record)

    for topic in range(n_topics):
        print("-----------------------Topic " + str(topic) + " --------------------------------")
        # Highest-weighted documents first, then the lowest-weighted ones.
        for extreme in (df.nlargest(n_extremes, topic), df.nsmallest(n_extremes, topic)):
            print(extreme)
            for index in extreme.index.to_list():
                print(file_list[index])
    averages = df.mean()
    print(averages)


In [12]:
def most_frequent(topic_prevalence, n=11):
    """Rank the values in ``topic_prevalence`` from most to least frequent.

    Repeatedly takes the statistical mode of the remaining values, records it,
    and removes every occurrence of it before the next round.

    Args:
        topic_prevalence: Iterable of hashable values (e.g. each document's
            top topic index).
        n: Maximum number of ranks to produce. Defaults to 11.

    Returns:
        List of up to ``n`` distinct values, most frequent first. The list is
        also printed. Fewer than ``n`` entries are returned when the input has
        fewer distinct values.
    """
    remaining = list(topic_prevalence)
    most_frequent_array = []
    # Stop once the values run out: statistics.mode raises StatisticsError on
    # empty data, which a fixed-length loop hit whenever the input had fewer
    # distinct values than iterations.
    while remaining and len(most_frequent_array) < n:
        topic = mode(remaining)
        most_frequent_array.append(topic)
        remaining = [i for i in remaining if i != topic]
    print(most_frequent_array)
    return most_frequent_array

In [13]:
# Directory passed to get_data_from_dir; every path matching "<directory>/*" is read.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
readme_directory = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"

In [18]:
# Load the corpus: four parallel lists (texts, word counts, mean word lengths, paths).
listed_corpus, wordcounts, wordlengths, file_list= get_data_from_dir(readme_directory)
# Corpus-level descriptive statistics over the per-file values.
print("Mean wordcount: ", mean(wordcounts))
print("Median wordcount: ", median(wordcounts))
print("Mean wordlength: ", mean(wordlengths))
print("Median wordlength: ", median(wordlengths))
# One cleaned, space-joined lemma string per document, in the same order as file_list.
lemmatized_corpus = preprocess(listed_corpus)

Mean wordcount:  324.0929957406531
Median wordcount:  156.0
Mean wordlength:  6.354120246310486
Median wordlength:  5.950514528900827


  k = self.parse_starttag(i)


In [None]:

# Disabled by wrapping it in a string literal: fits a CountVectorizer on
# lemmatized_corpus and saves it to '020725_README_vectorizer.joblib'.
# The following cell loads that saved file instead of re-fitting.
'''
vectorizer = CountVectorizer(analyzer='word',       
                         min_df=2,                        
                         stop_words='english',             
                         lowercase=True,                   
                         token_pattern='[a-zA-Z0-9]{2,}',  
                        )
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
joblib.dump(vectorizer, '020725_README_vectorizer.joblib')
'''


['020725_README_vectorizer.joblib']

In [20]:
# Reuse the previously saved vectorizer so the vocabulary (column order) is fixed;
# transform() only encodes the corpus and does not re-fit the vocabulary.
vectorizer = joblib.load('020725_README_vectorizer.joblib')
data_vectorized = vectorizer.transform(lemmatized_corpus)  

In [21]:
# Hyperparameter grid search; results are only printed (the function returns nothing).
lda_model_identification(data_vectorized)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........
[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1158862.039 total time=  17.6s
[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........
[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1121276.805 total time=  12.0s
[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........
[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1058330.478 total time=  12.6s
[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........
[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1169073.807 total time=  12.7s
[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=9..........
[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=9;, score=-1308701.275 total tim

In [23]:
# Fits the final model, prints the top words per topic and saves the model to disk.
# NOTE(review): best_lda_model saves to '020325_README_lda.jl', while the cells
# below load '020725_README_lda.jl' — confirm they are meant to use this fit.
topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())

Topic: 0
  image, data, key, file, color, option, support, format, default, mode
Topic: 1
  data, test, library, object, implementation, support, packet, used, byte, class
Topic: 2
  license, copyright, perl, gnu, free, version, module, public, general, warranty
Topic: 3
  test, value, function, return, method, class, string, type, object, example
Topic: 4
  http, git, server, install, client, request, test, version, project, command
Topic: 5
  json, node, require, string, parser, var, object, parse, function, font
Topic: 6
  command, output, option, process, make, program, script, tool, file, linux
Topic: 7
  table, html, tag, text, django, xml, example, path, template, default
Topic: 8
  install, make, build, library, version, directory, file, package, window, project


In [33]:
# Per topic: the highest weight and the path of the document that attains it.
# The function prints the dict and, as the cell's last expression, it is echoed again.
get_most_prevalent(data_vectorized, file_list)

{0: [0.9963399069190733, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'], 1: [0.9987558745140913, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'], 2: [0.999271074201955, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'], 3: [0.9966940236237574, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'], 4: [0.9962628678061417, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'], 5: [0.998166117886522, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'], 6: [0.9670683884278027, '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'], 7: [0.9996764637160757, '/data/users/mgaughan/kkex/012825_cam_re

{0: [0.9963399069190733,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/2711_klines.git_README.themes'],
 1: [0.9987558745140913,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1976_batmand.git_README'],
 2: [0.999271074201955,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1001_dhewm_dhewm3.git_README.txt'],
 3: [0.9966940236237574,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/3097_sharplispers_split-sequence_README.md'],
 4: [0.9962628678061417,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/516_boto_boto3_README.rst'],
 5: [0.998166117886522,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/674_barseghyanartur_transliterate_README.rst'],
 6: [0.9670683884278027,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/1617_kodi-pvr_pvr.iptvsimple.git_README.md'],
 7: [0.9996764637160757,
  '/data/users/mgau

In [36]:
# Writes the per-file topic weights to CSV and prints, per topic, the 10 highest-
# and 10 lowest-weighted documents followed by the mean weight of each topic.
prevalent_topics(data_vectorized, file_list)

183
-----------------------Topic 0 --------------------------------
             0         1         2         3         4         5         6  \
3142  0.996340  0.000457  0.000458  0.000458  0.000458  0.000457  0.000457   
810   0.995085  0.000614  0.000614  0.000614  0.000614  0.000614  0.000614   
3064  0.983533  0.002058  0.002058  0.002059  0.002059  0.002058  0.002059   
2980  0.960597  0.000512  0.035817  0.000512  0.000513  0.000512  0.000512   
197   0.892184  0.000950  0.000951  0.000950  0.000950  0.101164  0.000950   
131   0.867562  0.001765  0.001764  0.036755  0.001764  0.001765  0.001765   
3694  0.864345  0.001390  0.001390  0.001389  0.001390  0.049473  0.001390   
582   0.857786  0.000105  0.064223  0.043669  0.000105  0.000105  0.000105   
3026  0.851801  0.018529  0.018519  0.018536  0.018522  0.018532  0.018522   
1647  0.851778  0.018530  0.018519  0.018530  0.018528  0.018532  0.018540   

             7         8  
3142  0.000458  0.000458  
810   0.000614  0.0