18
0

Some improvements to run affinity clustering on larger dataset and

compute density.
This commit is contained in:
Nate E TeBlunthuis
2020-12-12 20:42:47 -08:00
parent e6294b5b90
commit 56269deee3
15 changed files with 84 additions and 84 deletions

View File

@@ -35,7 +35,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None,
subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
spark.stop()
weeks = list(subreddit_names.week.drop_duplicates())
d weeks = sorted(list(subreddit_names.week.drop_duplicates()))
for week in weeks:
print(f"loading matrix: {week}")
mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)