add note to try other tf normalization strategies.

Merge branch 'master' of code:cdsc_reddit
2022-03-31 12:17:16 -07:00 · 2021-08-03 15:03:40 -07:00 · 2021-07-28 15:32:21 -07:00 · 2021-07-28 15:32:04 -07:00 · 2021-04-22 10:46:26 -07:00 · 2021-04-22 10:38:10 -07:00
5 changed files with 40 additions and 24 deletions
--- a/clustering/Makefile
+++ b/clustering/Makefile
@@ -2,26 +2,29 @@
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
-selection_grid="--max_iter=10000 --convergence_iter=15,30,100 --preference_quantile=0.85 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9"
-all:$(clustering_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_authors-tf_similarities_30k.feather $(clustering_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_authors-tf_similarities_10k.feather $(clustering_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_10k.feather
+selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9"
+#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
+all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv
+# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
+# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS

-$(clustering_data)/subreddit_comment_authors_10k.feather:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(selection_grid) -J 20
+$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
+	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20

-$(clustering_data)/subreddit_comment_terms_10k.feather:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(selection_grid) -J 20
+$(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
+	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k  $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) -J 20 

-$(clustering_data)/subreddit_authors-tf_similarities_10k.feather:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(selection_grid) -J 20
+$(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
+	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k  $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20

-$(clustering_data)/subreddit_comment_authors_30k.feather:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10
+# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
+# 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS

-$(clustering_data)/subreddit_comment_terms_30k.feather:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10
+# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
+# 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS

-$(clustering_data)/subreddit_authors-tf_similarities_30k.feather:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8
+# $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
+# 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS


 # $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather
--- a/clustering/clustering.py
+++ b/clustering/clustering.py
@@ -24,7 +24,7 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
    preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. 
    '''
-    print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantilne}")
+    print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantile}")

    preference = np.quantile(mat,preference_quantile)

--- a/clustering/selection.py
+++ b/clustering/selection.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from multiprocessing  import Pool, cpu_count, Array, Process
 from pathlib import Path
 from itertools import product, starmap
+import numpy as np
 import pandas as pd
 import fire
 import sys
@@ -23,15 +24,27 @@ class clustering_result:
    alt_silhouette_score:float
    name:str

-def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits,  max_iter,  outdir:Path, random_state, verbose, alt_mat):
+
+def sim_to_dist(mat):
+    dist = 1-mat
+    dist[dist < 0] = 0
+    np.fill_diagonal(dist,0)
+    return dist
+
+def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits,  max_iter,  outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
-        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{convergence_iter}"
+        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    print(outpath)
    clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
-    score = silhouette_score(clustering.affinity_matrix_, clustering.labels_, metric='precomputed')
+    mat = sim_to_dist(clustering.affinity_matrix_)
+
+    score = silhouette_score(mat, clustering.labels_, metric='precomputed')
+
+    if alt_mat is not None:
+        alt_distances = sim_to_dist(alt_mat)
        alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
    
    res = clustering_result(outpath=outpath,
@@ -47,7 +60,7 @@ def do_clustering(damping, convergence_iter, preference_quantile, name, mat, sub

 # alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).

-def select_affinity_clustering(similarities, outdir, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
+def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):

    damping = list(map(float,damping))
    convergence_iter = convergence_iter = list(map(int,convergence_iter))
@@ -80,8 +93,9 @@ def select_affinity_clustering(similarities, outdir, damping=[0.9], max_iter=100
    print("running clustering selection")
    clustering_data = pool.starmap(_do_clustering, hyper_grid)
    clustering_data = pd.DataFrame(list(clustering_data))
+    clustering_data.to_csv(outinfo)
+    
    return clustering_data

-
 if __name__ == "__main__":
-    fire.Fire(select_affinity_clustering)
+    x = fire.Fire(select_affinity_clustering)
--- a/dumps/pull_pushshift_comments.sh
+++ b/dumps/pull_pushshift_comments.sh
@@ -8,7 +8,5 @@ wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base
 wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url

-# starting in 2020 we use daily dumps not monthly dumps
-wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/

 ./check_comments_shas.py
--- a/similarities/TODO
+++ b/similarities/TODO
@@ -0,0 +1 @@
+Try normalizing tf by the mean or std instead of the max to avoid penalizing subreddits with very active users.
Author	SHA1	Message	Date
Nathan TeBlunthuis	53f5b8c03c	add note to try other tf normalization strategies.	2022-03-31 12:17:16 -07:00
Nathan TeBlunthuis	14ab979f59	Merge branch 'master' of code:cdsc_reddit	2021-08-03 15:03:40 -07:00
Nate E TeBlunthuis	c6122bb429	Merge branch 'master' of code:cdsc_reddit	2021-07-28 15:32:21 -07:00
Nate E TeBlunthuis	596e1ff339	no longer do we need to get daily dumps	2021-07-28 15:32:04 -07:00
Nate E TeBlunthuis	f20365c07e	Merge branch 'master' of code:cdsc_reddit	2021-04-22 10:46:26 -07:00
Nate E TeBlunthuis	34e0a0a30d	version of weekly_cosine_similarities.py from klone	2021-04-22 10:38:10 -07:00
Nate E TeBlunthuis	37dd0ef55f	bugfixes in clustering selection.	2021-04-21 16:56:25 -07:00
				`@@ -0,0 +1 @@`
				`Try normalizing tf by the mean or std instead of the max to avoid penalizing subreddits with very active users.`