Some improvements to run affinity clustering on a larger dataset and

compute density.
2020-12-12 20:42:47 -08:00
parent e6294b5b90
commit 56269deee3
15 changed files with 84 additions and 84 deletions
--- a/density/Makefile
+++ b/density/Makefile
@@ -0,0 +1,7 @@
+# Build the overlap-density feather files from the reddit_similarity feather files.
+# `all` names no real file, so declare it phony to keep it from being shadowed.
+.PHONY: all
+all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather
+
+# Term overlap density: rebuilt when the script or the term similarity file changes.
+/gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
+	python3 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
+
+# Author overlap density: rebuilt when the script or the author similarity file changes.
+/gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+	python3 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
--- a/density/overlap_density.py
+++ b/density/overlap_density.py
@@ -0,0 +1,57 @@
+import pandas as pd
+from pandas.core.groupby import DataFrameGroupBy as GroupBy
+import fire
+import numpy as np
+
+def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
+    df = pd.read_feather(inpath)
+    df = df.drop('subreddit',1)
+    np.fill_diagonal(df.values,0)
+    df = agg(df, 0).reset_index()
+    df = df.rename({0:'overlap_density'},axis='columns')
+    df.to_feather(outpath)
+    return df
+
+def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
+    df = pd.read_parquet(inpath)
+    # exclude the diagonal
+    df = df.loc[df.subreddit != df.variable]
+    res = agg(df.groupby(['subreddit','week'])).reset_index()
+    res.to_feather(outpath)
+    return res
+
+def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
+                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
+    if type(agg) == str:
+        agg = eval(agg)
+
+    overlap_density(inpath, outpath, agg)
+
+def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
+                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
+
+    if type(agg) == str:
+        agg = eval(agg)
+
+    overlap_density(inpath, outpath, agg)
+
+def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
+                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
+    if type(agg) == str:
+        agg = eval(agg)
+
+    overlap_density_weekly(inpath, outpath, agg)
+
+def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
+                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
+    if type(agg) == str:
+        agg = eval(agg)
+
+    overlap_density_weekly(inpath, outpath, agg)
+
+
+if __name__ == "__main__":
+    fire.Fire({'authors':author_overlap_density,
+               'terms':term_overlap_density,
+               'author_weekly':author_overlap_density_weekly,
+               'term_weekly':term_overlap_density_weekly})