In [2]:
import re
import numpy as np
import pandas as pd
import glob
import copy
import csv
from statistics import mean, median
from strip_markdown import strip_markdown
import joblib

In [3]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from statistics import mode

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/SOC.NORTHWESTERN.EDU/nws8519/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# spacy and nltk for lemmatization
import nltk 
#nltk.download('stopwords')
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = stopwords.words('english')

In [6]:
def metadata_for_file(file):
    word_list = file.split()
    word_count = len(word_list)
    #print(word_list)
    if word_count == 0:
        avg_word_length = 0
    else: 
        avg_word_length = sum(map(len, word_list))  / len(word_list)
    #return number of paragraphs
    return word_count, avg_word_length

In [9]:
def get_data_from_dir(directory):
    files = glob.glob(f"{directory}/*")
    data_list = []
    word_counts = []
    avg_word_lengths = []
    file_list = []
    for file in files:
        text = open(file, encoding='utf-8').read()
        #here's some of the descriptive text analysis
        word_count, avg_word_length = metadata_for_file(text)
        word_counts.append(word_count)
        avg_word_lengths.append(avg_word_length)
        #adding the data to the list of text
        data_list.append(text)
        #adding filename
        file_list.append(file)
    return data_list, word_counts, avg_word_lengths, file_list

In [10]:
#preprocessing text data
def preprocess(corpus_list):
    #extending stopwords 
    specific_stopwords = ["http", "com", "www", "org", "file", "code", "time", "software", "use", "user", "set", "line", "run", "source", "github",
    "lineno", "python", "php", "ruby", "api"]
    stopwords.extend(specific_stopwords)
    D = copy.copy(corpus_list)
    #stripping markdown from documents
    D = [strip_markdown(doc) for doc in D]
    #strip html 
    D = [re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL) for doc in D]
    #mvp right now, can certainly be expanded as iterations of text analysis are done
    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]
    lemmatizer = WordNetLemmatizer()
    D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]
    return D_lemma

In [11]:
#preparing processed data for model usage
def text_preparation(lemmatized_text):
    #bigrams
    D_bigrams = copy.copy(lemmatized_text)
    bigram = Phrases(D_bigrams, min_count=2)
    for i in range(len(lemmatized_text)):
        for token in bigram[D_bigrams[i]]:
            if '_' in token:
                D_bigrams[i].append(token)
    #id2word
    id2word = corpora.Dictionary(D_bigrams)
    id2word.filter_extremes(no_below=5, no_above=0.5)
    #bow representation 
    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
    return bag_of_words, id2word

In [12]:
def lda_model_identification(data_vectorized):
    lda = LatentDirichletAllocation()
    search_params = {'n_components': [5], 'learning_decay': [.5, .7, .9], 'batch_size' : [128, 256]  }
    model = GridSearchCV(lda, param_grid=search_params, verbose=10)
    model.fit(data_vectorized)
    best_lda_model = model.best_estimator_
    print("Best Model's Params: ", model.best_params_)
    print("Best Log Likelihood Score: ", model.best_score_)
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

In [13]:
def best_lda_model(data_vectorized, vocab):
    lda = LatentDirichletAllocation(n_components=5, learning_decay = 0.7, batch_size = 256, max_iter = 50)
    id_topic = lda.fit_transform(data_vectorized)
    topic_words = {}
    for topic, comp in enumerate(lda.components_):
        word_idx = np.argsort(comp)[::-1][:10]
        topic_words[topic] = [vocab[i] for i in word_idx]
    for topic, words in topic_words.items():
        print('Topic: %d' % topic)
        print('  %s' % ', '.join(words))
    #lda.print_topics(num_words=10)
    joblib.dump(lda, '020125_CONTRIBUTING_lda.jl')
    #lda = joblib.load('0509_lda.jl')
    return id_topic

In [14]:
def get_most_prevalent(vect_documents, documents):
    lda = joblib.load('020125_CONTRIBUTING_lda.jl')
    distributions = lda.transform(vect_documents)
    most_prevalent = {0: [0, ""],1: [0, ""], 2: [0, ""], 3: [0, ""], 4: [0, ""]}
    for i, topic_distribution in enumerate(distributions):
        for j in range(5):
            if topic_distribution[j] > most_prevalent[j][0]:
                most_prevalent[j] = [topic_distribution[j], documents[i]]
    print(most_prevalent)
    return most_prevalent


In [36]:
def prevalent_topics(vect_documents, file_list):
    lda = joblib.load('020125_CONTRIBUTING_lda.jl')
    #lda = joblib.load('0514_contrib_lda.jl')
    distributions = lda.transform(vect_documents)
    #figuring out what the max distribution is and then figuring out the mode
    top_topic = []
    count_of_multiple = 0
    topic_arrays = []
    for i, topic_distribution in enumerate(distributions):
        max_dist = max(topic_distribution)
        indexes = np.where(topic_distribution == max_dist)[0]
        if len(indexes) == 1:
            top_topic.append(indexes[0])
        else:
            count_of_multiple += 1
        topic_arrays.append(topic_distribution)
    #most_frequent(top_topic)
    print(count_of_multiple)
    df = pd.DataFrame(topic_arrays)
    #finding the distribution values for all documents
    with open('020125_CONTRIBUTING_file_topic_distributions.csv', 'w', newline='') as csvfile:
        fieldnames = ['filename', 't0', 't1', 't2', 't3', 't4']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, row in df.iterrows():
            project_dir =  {}
            project_dir['filename'] = file_list[i].split("/")[-1]
            array_row = df.iloc[i].to_numpy()
            for j in range(5):
                project_dir["t" + str(j)] = array_row[j]
            writer.writerow(project_dir)
    #print(df.sort_values(by=['0']).head(5))
    for i in range(5):
        print("-----------------------Topic " + str(i) + " --------------------------------")
        top5 = df.nlargest(10, i)
        top_indices = top5.index.to_list()
        print(top5)
        for index in top_indices:
            print(file_list[index])
        bottom5 = df.nsmallest(10, i)
        bottom_indices = bottom5.index.to_list()
        print(bottom5)
        for index in bottom_indices:
            print(file_list[index])
    averages = df.mean()
    print(averages)


In [15]:
def most_frequent(topic_prevalence):
    most_frequent_array = []
    for j in range(5):
        topic = mode(topic_prevalence)
        most_frequent_array.append(topic)
        topic_prevalence = [i for i in topic_prevalence if i != topic]
    print(most_frequent_array)

In [7]:
contributing_directory = "/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/"

In [16]:
listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(contributing_directory)
print("Mean wordcount: ", mean(wordcounts))
print("Median wordcount: ", median(wordcounts))
print("Mean wordlength: ", mean(wordlengths))
print("Median wordlength: ", median(wordlengths))
lemmatized_corpus = preprocess(listed_corpus)

Mean wordcount:  357.8979020979021
Median wordcount:  225
Mean wordlength:  6.345704522542385
Median wordlength:  5.778551532033426


In [17]:
'''
vectorizer = CountVectorizer(analyzer='word',       
                         min_df=2,                        
                         stop_words='english',             
                         lowercase=True,                   
                         token_pattern='[a-zA-Z0-9]{2,}',  
                        )
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
joblib.dump(vectorizer, '020125_CONTRIBUTING_vectorizer.joblib')
'''

['020125_CONTRIBUTING_vectorizer.joblib']

In [17]:
vectorizer = joblib.load('020125_CONTRIBUTING_vectorizer.joblib')
data_vectorized = vectorizer.transform(lemmatized_corpus)  

In [23]:
#lda_model_identification(data_vectorized)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........
[CV 1/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-196851.911 total time=   2.1s
[CV 2/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........
[CV 2/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-168250.194 total time=   2.0s
[CV 3/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........
[CV 3/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-180223.622 total time=   2.0s
[CV 4/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........
[CV 4/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-183729.380 total time=   2.0s
[CV 5/5; 1/6] START batch_size=128, learning_decay=0.5, n_components=5..........
[CV 5/5; 1/6] END batch_size=128, learning_decay=0.5, n_components=5;, score=-174617.480 total time=   

In [25]:
#topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())

Topic: 0
  test, new, function, example, style, file, make, build, command, version
Topic: 1
  test, issue, request, pull, bug, http, feature, git, make, install
Topic: 2
  git, test, branch, change, commit, make, request, pull, release, master
Topic: 3
  contribution, license, project, open, submit, developer, right, contributor, sign, patch
Topic: 4
  issue, request, pull, bug, project, change, contributing, contribution, feature, open


In [18]:
topic_prevalence = get_most_prevalent(data_vectorized, file_list)

{0: [0.999495078557156, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt'], 1: [0.9980153669818502, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/tantale_deprecated.git_hullabaloo_CONTRIBUTING.rst'], 2: [0.9989886873615608, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/puppetlabs_puppetlabs-firewall_hullabaloo_CONTRIBUTING.md'], 3: [0.9983908776533259, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/webcamoid_webcamoid.git_hullabaloo_CONTRIBUTING.md'], 4: [0.9980246890436791, '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/processone_pkix.git_hullabaloo_CONTRIBUTING.md']}


In [37]:
prevalent_topics(data_vectorized, file_list)

4
-----------------------Topic 0 --------------------------------
            0         1         2         3         4
536  0.999495  0.000126  0.000126  0.000127  0.000126
494  0.998076  0.000483  0.000480  0.000480  0.000481
403  0.997270  0.000682  0.000683  0.000677  0.000688
147  0.992964  0.001763  0.001779  0.001722  0.001773
564  0.992964  0.001763  0.001779  0.001722  0.001773
647  0.985526  0.013136  0.000446  0.000442  0.000450
106  0.985206  0.003688  0.003672  0.003728  0.003705
422  0.977476  0.000474  0.000469  0.000469  0.021112
502  0.967482  0.031760  0.000254  0.000251  0.000252
43   0.943894  0.001284  0.001282  0.052239  0.001301
/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/OpenPrinting_cups_hullabaloo_CONTRIBUTING.txt
/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing/osmcode_libosmium.git_hullabaloo_CONTRIBUTING.md
/data/users/mgaughan/kkex/012825_cam_revision_m

In [19]:
most_frequent(topic_prevalence)

[0, 1, 2, 3, 4]
