In [1]:
import re
import numpy as np
import pandas as pd
import glob
import copy
import csv
from statistics import mean, median
from strip_markdown import strip_markdown
import joblib

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from statistics import mode

In [15]:
#import nltk
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/SOC.NORTHWESTERN.EDU/nws8519/nltk_data...


True

In [3]:
# spacy and nltk for lemmatization
import nltk 
#nltk.download('stopwords')
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE: this rebinds the name `stopwords` from the imported nltk corpus reader
# to a plain Python list of English stopwords; preprocess() uses this list.
stopwords = stopwords.words('english')

In [4]:
def metadata_for_file(file):
    """Compute simple descriptive statistics for one document.

    Parameters
    ----------
    file : str
        The document's text (not a path). It is split on whitespace.

    Returns
    -------
    tuple
        ``(word_count, avg_word_length)``. The average is ``0`` when the
        text contains no whitespace-delimited tokens.
    """
    tokens = file.split()
    n_tokens = len(tokens)
    # Guard clause: avoid dividing by zero for empty / whitespace-only text.
    if not n_tokens:
        return n_tokens, 0
    total_chars = sum(len(token) for token in tokens)
    return n_tokens, total_chars / n_tokens

In [5]:
def get_data_from_dir(directory):
    """Read every entry matched by ``<directory>/*`` and collect text plus metadata.

    Parameters
    ----------
    directory : str
        Directory whose direct children are read as UTF-8 text
        (undecodable bytes are ignored).

    Returns
    -------
    tuple of four parallel lists
        ``(data_list, word_counts, avg_word_lengths, file_list)`` where index
        ``i`` of each list refers to the same file: its text, its word count,
        its average word length and its path as returned by ``glob``.
    """
    data_list = []
    word_counts = []
    avg_word_lengths = []
    file_list = []
    for path in glob.glob(f"{directory}/*"):
        # Use a context manager so the handle is closed deterministically
        # instead of being left open until garbage collection.
        with open(path, encoding='utf-8', errors='ignore') as handle:
            text = handle.read()
        # descriptive text statistics for this document
        word_count, avg_word_length = metadata_for_file(text)
        word_counts.append(word_count)
        avg_word_lengths.append(avg_word_length)
        # keep the text and its filename aligned by position
        data_list.append(text)
        file_list.append(path)
    return data_list, word_counts, avg_word_lengths, file_list

In [6]:
#preprocessing text data
def preprocess(corpus_list):
    """Clean, tokenize, filter and lemmatize a list of raw documents.

    For each document: markdown is stripped, HTML comments (``<!-- ... -->``)
    are removed, the text is tokenized with gensim's ``simple_preprocess``,
    stopwords and tokens of length <= 2 are dropped, and the remaining tokens
    are lemmatized with WordNet and re-joined with single spaces.

    Parameters
    ----------
    corpus_list : list of str
        Raw document texts. The list itself is not modified.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input document,
        in the same order as ``corpus_list``.
    """
    # domain-specific stopwords added on top of the module-level `stopwords` list
    specific_stopwords = ["http", "com", "www", "org", "file", "code", "time", "software", "use", "user", "set", "line", "run", "source", "github",
    "lineno", "python", "php", "ruby", "api"]
    # Build a local set instead of extending the module-level list: extending
    # it grew the shared list (with duplicates) on every call, and a set makes
    # the per-token membership test constant-time.
    stop_set = set(stopwords).union(specific_stopwords)
    lemmatizer = WordNetLemmatizer()
    D_lemma = []
    for doc in corpus_list:
        # stripping markdown from the document
        text = strip_markdown(doc)
        # strip HTML comments (other HTML markup is left as-is)
        text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
        #mvp right now, can certainly be expanded as iterations of text analysis are done
        tokens = [token for token in simple_preprocess(text) if token not in stop_set and len(token) > 2]
        D_lemma.append(" ".join(lemmatizer.lemmatize(token) for token in tokens))
    return D_lemma

In [7]:
#preparing processed data for model usage
def text_preparation(lemmatized_text):
    """Build a gensim dictionary and bag-of-words corpus with detected bigrams appended.

    Parameters
    ----------
    lemmatized_text : list of list of str
        Tokenized documents. The caller's documents are not modified.
        NOTE(review): preprocess() in this notebook returns space-joined
        strings, not token lists, so its output would need to be split
        before being passed here -- confirm the intended input.

    Returns
    -------
    tuple
        ``(bag_of_words, id2word)``: the ``doc2bow`` representation of each
        document and the filtered ``corpora.Dictionary`` used to build it.
    """
    # Copy each document, not just the outer list: a shallow copy of the
    # corpus shares the inner token lists, so appending bigrams below would
    # silently grow the caller's documents (and double up on a second call).
    D_bigrams = [copy.copy(doc) for doc in lemmatized_text]
    #bigrams
    bigram = Phrases(D_bigrams, min_count=2)
    for doc in D_bigrams:
        for token in bigram[doc]:
            # tokens containing '_' are the phrases joined by the Phrases model
            if '_' in token:
                doc.append(token)
    #id2word
    id2word = corpora.Dictionary(D_bigrams)
    # drop tokens appearing in fewer than 5 documents or in more than 50% of them
    id2word.filter_extremes(no_below=5, no_above=0.5)
    #bow representation
    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
    return bag_of_words, id2word

In [8]:
def lda_model_identification(data_vectorized, random_state=None):
    """Grid-search LDA hyperparameters and print a summary of the best model.

    Parameters
    ----------
    data_vectorized : document-term matrix
        Input passed to ``GridSearchCV.fit`` and to ``perplexity``.
    random_state : int or None, optional
        Seed forwarded to ``LatentDirichletAllocation`` so a search can be
        reproduced. The default ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    None
        The best parameters, best cross-validated score and the best
        estimator's perplexity on ``data_vectorized`` are printed.
    """
    lda = LatentDirichletAllocation(random_state=random_state)
    # NOTE(review): per the scikit-learn docs, learning_decay and batch_size
    # only apply to learning_method='online'; the estimator is created with
    # its default learning method here, so confirm that these two grid
    # dimensions actually change the fitted models.
    search_params = {'n_components': [11], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256]  }
    model = GridSearchCV(lda, param_grid=search_params, verbose=10)
    model.fit(data_vectorized)
    # local name deliberately differs from the notebook's best_lda_model() function
    best_estimator = model.best_estimator_
    print("Best Model's Params: ", model.best_params_)
    print("Best Log Likelihood Score: ", model.best_score_)
    print("Model Perplexity: ", best_estimator.perplexity(data_vectorized))

In [18]:
def best_lda_model(data_vectorized, vocab, n_topics=11, n_top_words=10,
                   model_path='020325_README_lda.jl', random_state=None):
    """Fit the final LDA model, print each topic's top words and persist the model.

    Parameters
    ----------
    data_vectorized : document-term matrix
        Input passed to ``fit_transform``.
    vocab : sequence of str
        Vocabulary indexed by feature column, used to label top words.
    n_topics : int, optional
        Number of LDA components (default 11, the previously hard-coded value).
    n_top_words : int, optional
        How many highest-weight words to print per topic (default 10).
    model_path : str, optional
        Where the fitted model is written with ``joblib.dump``.
    random_state : int or None, optional
        Seed forwarded to ``LatentDirichletAllocation``; ``None`` (default)
        keeps the previous unseeded behavior.

    Returns
    -------
    array
        The document-topic matrix returned by ``fit_transform``.
    """
    lda = LatentDirichletAllocation(n_components=n_topics, learning_decay=0.9, batch_size=256,
                                    max_iter=50, random_state=random_state)
    id_topic = lda.fit_transform(data_vectorized)
    for topic, comp in enumerate(lda.components_):
        # indices of the highest-weight words for this topic, largest first
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        print('Topic: %d' % topic)
        print('  %s' % ', '.join(vocab[i] for i in word_idx))
    # persisted so the later cells can reload the exact fitted model
    joblib.dump(lda, model_path)
    return id_topic

In [10]:
def get_most_prevalent(vect_documents, documents, model_path='020325_README_lda.jl'):
    """Find, for every topic, the document with the highest weight on that topic.

    Parameters
    ----------
    vect_documents : document-term matrix
        Vectorized documents passed to the saved model's ``transform``.
    documents : sequence
        Labels (here: file paths) aligned row-by-row with ``vect_documents``.
    model_path : str, optional
        Path of the joblib-serialized LDA model to load.

    Returns
    -------
    dict
        ``{topic_index: [weight, document]}`` for each topic; an entry stays
        ``[0, ""]`` if no document has a weight above 0 for that topic. The
        dict is also printed.
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    lda = joblib.load(model_path)
    distributions = lda.transform(vect_documents)
    # Take the topic count from the model output instead of assuming 11.
    n_topics = distributions.shape[1]
    most_prevalent = {topic: [0, ""] for topic in range(n_topics)}
    for i, topic_distribution in enumerate(distributions):
        for j in range(n_topics):
            # strict '>' keeps the first document seen when weights tie
            if topic_distribution[j] > most_prevalent[j][0]:
                most_prevalent[j] = [topic_distribution[j], documents[i]]
    print(most_prevalent)
    return most_prevalent


In [23]:
def prevalent_topics(vect_documents, file_list, model_path='020325_README_lda.jl',
                     output_csv='020325_README_file_topic_distributions.csv', n_examples=10):
    """Export per-document topic distributions and print per-topic extremes.

    Loads the saved LDA model, transforms ``vect_documents`` and then:

    1. prints how many documents do not have a single, unique top topic;
    2. writes one CSV row per document (``filename, t0, t1, ...``);
    3. for every topic, prints the ``n_examples`` documents with the largest
       and the smallest weight, followed by their paths;
    4. prints the mean weight of every topic.

    Parameters
    ----------
    vect_documents : document-term matrix
        Vectorized documents passed to the model's ``transform``.
    file_list : sequence of str
        File paths aligned row-by-row with ``vect_documents``.
    model_path : str, optional
        Path of the joblib-serialized LDA model to load.
    output_csv : str, optional
        Destination of the per-document distribution CSV.
    n_examples : int, optional
        Rows shown for each of the largest / smallest listings (default 10).

    Returns
    -------
    None
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    lda = joblib.load(model_path)
    distributions = lda.transform(vect_documents)
    # Derive the topic count from the model output rather than hard-coding 11.
    n_topics = distributions.shape[1]
    # Count the documents whose maximum weight is not reached by exactly one topic.
    row_max = distributions.max(axis=1, keepdims=True)
    count_of_multiple = int(np.count_nonzero((distributions == row_max).sum(axis=1) != 1))
    print(count_of_multiple)
    df = pd.DataFrame(distributions)
    # writing the distribution values for all documents
    fieldnames = ['filename'] + ['t' + str(j) for j in range(n_topics)]
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, topic_distribution in enumerate(distributions):
            # only the last path component is stored as the filename
            record = {'filename': file_list[i].split("/")[-1]}
            for j in range(n_topics):
                record['t' + str(j)] = topic_distribution[j]
            writer.writerow(record)
    for topic in range(n_topics):
        print("-----------------------Topic " + str(topic) + " --------------------------------")
        highest = df.nlargest(n_examples, topic)
        print(highest)
        for index in highest.index.to_list():
            print(file_list[index])
        lowest = df.nsmallest(n_examples, topic)
        print(lowest)
        for index in lowest.index.to_list():
            print(file_list[index])
    averages = df.mean()
    print(averages)


In [12]:
def most_frequent(topic_prevalence, n_topics=11):
    """Rank values by how often they occur, most frequent first.

    The mode is taken repeatedly, removing each mode from the data before
    the next round, for at most ``n_topics`` rounds.

    Parameters
    ----------
    topic_prevalence : iterable
        Observed values (e.g. the top topic index of each document).
    n_topics : int, optional
        Maximum number of ranked values to produce (default 11).

    Returns
    -------
    list
        The ranked values; shorter than ``n_topics`` when the input has
        fewer distinct values. The list is also printed.
    """
    most_frequent_array = []
    remaining = list(topic_prevalence)
    for _ in range(n_topics):
        # Stop once every distinct value has been ranked: statistics.mode
        # raises StatisticsError on empty data.
        if not remaining:
            break
        topic = mode(remaining)
        most_frequent_array.append(topic)
        remaining = [value for value in remaining if value != topic]
    print(most_frequent_array)
    return most_frequent_array

In [13]:
# Directory whose files are read as the README corpus by get_data_from_dir().
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
readme_directory = "/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/"

In [14]:
# Load the README corpus together with its per-file word statistics.
listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(readme_directory)
# Report mean / median of both statistics, in the same order as before.
corpus_stats = (
    ("Mean wordcount: ", mean, wordcounts),
    ("Median wordcount: ", median, wordcounts),
    ("Mean wordlength: ", mean, wordlengths),
    ("Median wordlength: ", median, wordlengths),
)
for label, statistic, values in corpus_stats:
    print(label, statistic(values))
# Clean, tokenize and lemmatize every document for vectorization.
lemmatized_corpus = preprocess(listed_corpus)

Mean wordcount:  271.6877796091359
Median wordcount:  98
Mean wordlength:  6.063122274716372
Median wordlength:  5.841269841269841


  k = self.parse_starttag(i)


In [15]:

# Disabled one-time setup: the code below is wrapped in a string literal, so
# none of it runs. It shows how a CountVectorizer would be fitted on the
# lemmatized corpus and saved as '020325_README_vectorizer.joblib', the same
# file name that the following cell loads.
'''
vectorizer = CountVectorizer(analyzer='word',       
                         min_df=2,                        
                         stop_words='english',             
                         lowercase=True,                   
                         token_pattern='[a-zA-Z0-9]{2,}',  
                        )
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
joblib.dump(vectorizer, '020325_README_vectorizer.joblib')
'''


['020325_README_vectorizer.joblib']

In [16]:
# Load the saved vectorizer from disk and apply it (transform only, no re-fit)
# to the lemmatized corpus to obtain the document-term matrix.
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
vectorizer = joblib.load('020325_README_vectorizer.joblib')
data_vectorized = vectorizer.transform(lemmatized_corpus)  

In [17]:
# Grid-search LDA hyperparameters on the vectorized corpus; the function prints
# the best parameters, the best score and the best model's perplexity.
lda_model_identification(data_vectorized)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........
[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1005863.489 total time=  10.2s
[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........
[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1011357.156 total time=  10.0s
[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........
[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-1015386.424 total time=  10.0s
[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........
[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-965023.515 total time=  10.3s
[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=11.........
[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=11;, score=-994223.612 total 

In [19]:
# Fit the final LDA model (printing each topic's top words and saving the model
# to disk) and keep the resulting document-topic matrix.
topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())

Topic: 0
  test, library, object, google, include, class, interface, using, build, example
Topic: 1
  server, client, option, command, network, device, port, support, interface, default
Topic: 2
  value, function, string, data, object, return, type, table, method, error
Topic: 3
  install, build, make, package, configure, debian, git, need, directory, gnome
Topic: 4
  obj, filter, stream, length, type, page, count, parent, max, resource
Topic: 5
  window, mode, color, game, key, menu, default, size, button, sound
Topic: 6
  file, directory, path, install, make, command, default, version, option, usr
Topic: 7
  license, version, gnu, http, public, free, general, copyright, project, install
Topic: 8
  model, django, url, module, password, key, import, request, date, add
Topic: 9
  library, file, version, make, module, perl, support, makefile, image, program
Topic: 10
  html, git, copyright, license, copy, text, json, example, new, install


In [21]:
# get_most_prevalent() already prints the dictionary; binding the return value
# to a name keeps the notebook from echoing the same large dict a second time
# as the cell's result, and makes it available to later cells.
most_prevalent_docs = get_most_prevalent(data_vectorized, file_list)

{0: [0.9998131703476353, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'], 1: [0.9936580635354768, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'], 2: [0.9992995657213791, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'], 3: [0.988192939654375, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'], 4: [0.9964897891037261, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'], 5: [0.9943880112670485, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'], 6: [0.99975972937

{0: [0.9998131703476353,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/sqlmapproject_sqlmap.git_hullabaloo_README.pdf'],
 1: [0.9936580635354768,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/kjn_lbzip2_hullabaloo_README'],
 2: [0.9992995657213791,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/linuxmint_muffin.git_hullabaloo_README'],
 3: [0.988192939654375,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/chewing_scim-chewing.git_hullabaloo_README'],
 4: [0.9964897891037261,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/horde_webmail.git_hullabaloo_README'],
 5: [0.9943880112670485,
  '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme/state-machines_state_machines-activemodel_hullabaloo_README.md'],


In [24]:
# Write the per-file topic distributions to CSV and print, for every topic,
# the documents with the largest and smallest weights plus the topic means.
prevalent_topics(data_vectorized, file_list)

349
-----------------------Topic 0 --------------------------------
            0         1         2         3         4         5         6   \
3551  0.984847  0.001515  0.001515  0.001515  0.001515  0.001515  0.001515   
3413  0.981059  0.001894  0.001894  0.001894  0.001894  0.001894  0.001894   
3396  0.973259  0.002674  0.002674  0.002674  0.002674  0.002675  0.002674   
1240  0.965032  0.003497  0.003497  0.003497  0.003497  0.003497  0.003497   
946   0.960470  0.003953  0.003953  0.003953  0.003953  0.003953  0.003953   
2914  0.958673  0.004133  0.004132  0.004133  0.004133  0.004132  0.004133   
225   0.954660  0.000918  0.000918  0.000918  0.000918  0.000918  0.000918   
2355  0.943176  0.005683  0.005682  0.005682  0.005682  0.005682  0.005682   
2913  0.943019  0.016762  0.000654  0.000654  0.000654  0.000654  0.000654   
901   0.942914  0.001421  0.001420  0.044301  0.001421  0.001421  0.001421   

            7         8         9         10  
3551  0.001515  0.001515  