13
0

git-annex in nathante@mox2.hyak.local:/gscratch/comdata/users/nathante/cdsc-reddit

This commit is contained in:
Nate E TeBlunthuis 2020-11-17 16:31:48 -08:00
parent f8ff8b2d0f
commit 1bf206d219
5 changed files with 25 additions and 22 deletions

View File

@ -1,35 +1,34 @@
import fire
import pyarrow import pyarrow
import pandas as pd import pandas as pd
from numpy import random from numpy import random
import numpy as np import numpy as np
from sklearn.manifold import TSNE from sklearn.manifold import TSNE
df = pd.read_feather("reddit_term_similarity_3000.feather") similarities = "term_similarities_10000.feather"
df = df.sort_values(['i','j'])
n = max(df.i.max(),df.j.max()) def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
'''
similarities: feather file with a dataframe of similarity scores
learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball.
perplexity: number of neighbors to use. the default of 50 is often good.
def zero_pad(grp): '''
p = grp.shape[0] df = pd.read_feather(similarities)
grp = grp.sort_values('j')
return np.concatenate([np.zeros(n-p),np.ones(1),np.array(grp.value)])
col_names = df.sort_values('j').loc[:,['subreddit_j']].drop_duplicates() n = df.shape[0]
first_name = list(set(df.subreddit_i) - set(df.subreddit_j))[0] mat = np.array(df.drop('subreddit',1),dtype=np.float64)
col_names = [first_name] + list(col_names.subreddit_j) mat[range(n),range(n)] = 1
mat = df.groupby('i').apply(zero_pad) mat[mat > 1] = 1
mat.loc[n] = np.concatenate([np.zeros(n),np.ones(1)])
mat = np.stack(mat)
mat = mat + np.tril(mat.transpose(),k=-1)
dist = 2*np.arccos(mat)/np.pi dist = 2*np.arccos(mat)/np.pi
tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1) tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1)
tsne_fit_model = tsne_model.fit(dist) tsne_fit_model = tsne_model.fit(dist)
tsne_fit_whole = tsne_fit_model.fit_transform(dist) tsne_fit_whole = tsne_fit_model.fit_transform(dist)
plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':col_names}) plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit})
plot_data.to_feather("tsne_subreddit_fit.feather") plot_data.to_feather(output)
if __name__ == "__main__":
fire.Fire(fit_tsne)

View File

@ -0,0 +1 @@
../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784

View File

@ -0,0 +1 @@
../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e

View File

@ -0,0 +1 @@
../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543

View File

@ -0,0 +1 @@
../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf