# cdsc_reddit/density/overlap_density.py

import pandas as pd
from pandas.core.groupby import DataFrameGroupBy as GroupBy
import fire
import numpy as np
import sys
sys.path.append("..")
sys.path.append("../similarities")
from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval

# Overlap density is the mean of the ratio of the overlap to the focal size,
# i.e. the mean shared membership per focal community member.
# NOTE: the default `agg` used by the functions below is a sum, not a mean.
# The input is the author tf-idf matrix.

def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
    """Aggregate the off-diagonal entries of a similarity matrix per column.

    Reads a Feather file, drops its ``subreddit`` column, zeroes the main
    diagonal of the remaining values, aggregates every column with ``agg``
    and writes the result back out as a Feather file.

    Parameters
    ----------
    inpath : str
        Path of the Feather file to read. It must contain a ``subreddit``
        column; the remaining columns are treated as the matrix.
        (Presumably the matrix is square with rows in the same order as the
        columns, so the diagonal holds each community's entry with itself
        -- confirm against the producer of this file.)
    outpath : str
        Path the resulting Feather file is written to.
    agg : callable
        Called as ``agg(df, 0)``; defaults to ``pd.DataFrame.sum`` which sums
        each column.

    Returns
    -------
    pandas.DataFrame
        The aggregated frame after ``reset_index()``: the former column
        labels end up in a column named ``index`` and the aggregated values
        in ``overlap_density``.
    """
    df = pd.read_feather(inpath)
    # Use the keyword form: passing the axis positionally (``drop(label, 1)``)
    # is rejected by pandas >= 2.0.
    df = df.drop(columns='subreddit')

    # Zero the diagonal on an explicit, writable copy. ``df.values`` is not
    # guaranteed to be a writable view of the frame (it may be a copy, or a
    # read-only array), so filling it in place can silently do nothing or fail.
    values = df.to_numpy(copy=True)
    np.fill_diagonal(values, 0)
    df = pd.DataFrame(values, index=df.index, columns=df.columns)

    df = agg(df, 0).reset_index()
    # reset_index() on the unnamed aggregate leaves the values in column ``0``.
    df = df.rename({0:'overlap_density'},axis='columns')
    df.to_feather(outpath)
    return df

def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
    """Aggregate pairwise rows per (subreddit, week), skipping self-pairs.

    Reads a Parquet file that has at least the columns ``subreddit``,
    ``variable`` and ``week``, discards the rows where ``subreddit`` equals
    ``variable``, groups what is left by ``subreddit`` and ``week`` and applies
    ``agg`` to the grouped frame.

    Parameters
    ----------
    inpath : str
        Path of the Parquet file to read.
    outpath : str
        Path the result is written to. The file is always written in Feather
        format, whatever extension the path carries.
    agg : callable
        Called with the ``groupby`` object; defaults to ``GroupBy.sum``.

    Returns
    -------
    pandas.DataFrame
        The aggregated frame with ``subreddit`` and ``week`` restored as
        ordinary columns.
    """
    pairs = pd.read_parquet(inpath)

    # Rows pairing a subreddit with itself are the "diagonal"; leave them out.
    is_off_diagonal = pairs.subreddit != pairs.variable
    per_subreddit_week = pairs.loc[is_off_diagonal].groupby(['subreddit','week'])

    res = agg(per_subreddit_week).reset_index()
    res.to_feather(outpath)
    return res


# inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
# min_df=1;
# included_subreddits=None;
# topN=10000;
# outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"

# to_date=2019-10-28


def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    """Command-line wrapper around ``overlap_density`` with author-file defaults.

    Parameters
    ----------
    inpath : str
        Feather file to read; defaults to the comment-authors similarity file.
    outpath : str
        Feather file to write; defaults to the comment-authors density file.
    agg : callable or str
        Aggregator forwarded to ``overlap_density``. A string is evaluated as
        a Python expression first (e.g. ``"pd.DataFrame.mean"``), which is how
        an aggregator can be named on the command line.

    Returns
    -------
    None
    """
    # SECURITY: eval() executes arbitrary Python. This is only acceptable
    # because the string comes from the operator's own command line; never
    # forward untrusted input to `agg`.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)

def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    """Command-line wrapper around ``overlap_density`` with term-file defaults.

    Parameters
    ----------
    inpath : str
        Feather file to read; defaults to the comment-terms similarity file.
    outpath : str
        Feather file to write; defaults to the comment-term density file.
    agg : callable or str
        Aggregator forwarded to ``overlap_density``. A string is evaluated as
        a Python expression first (e.g. ``"pd.DataFrame.mean"``), which is how
        an aggregator can be named on the command line.

    Returns
    -------
    None
    """
    # SECURITY: eval() executes arbitrary Python. This is only acceptable
    # because the string comes from the operator's own command line; never
    # forward untrusted input to `agg`.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density(inpath, outpath, agg)

def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    """Command-line wrapper around ``overlap_density_weekly`` with author-file defaults.

    Parameters
    ----------
    inpath : str
        Parquet file to read; defaults to the weekly subreddit-authors
        similarity file.
    outpath : str
        Feather file to write; defaults to the weekly comment-authors density
        file.
    agg : callable or str
        Aggregator forwarded to ``overlap_density_weekly``. A string is
        evaluated as a Python expression first (e.g. ``"GroupBy.mean"``),
        which is how an aggregator can be named on the command line.

    Returns
    -------
    None
    """
    # SECURITY: eval() executes arbitrary Python. This is only acceptable
    # because the string comes from the operator's own command line; never
    # forward untrusted input to `agg`.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)

def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    """Command-line wrapper around ``overlap_density_weekly`` with term-file defaults.

    Parameters
    ----------
    inpath : str
        Parquet file to read; defaults to the weekly comment-terms similarity
        file.
    outpath : str
        File to write; defaults to the weekly comment-terms density file.
        NOTE(review): the default path ends in ``.parquet`` but
        ``overlap_density_weekly`` writes Feather data -- the default is kept
        as-is for compatibility; confirm which format downstream readers
        expect.
    agg : callable or str
        Aggregator forwarded to ``overlap_density_weekly``. A string is
        evaluated as a Python expression first (e.g. ``"GroupBy.mean"``),
        which is how an aggregator can be named on the command line.

    Returns
    -------
    None
    """
    # SECURITY: eval() executes arbitrary Python. This is only acceptable
    # because the string comes from the operator's own command line; never
    # forward untrusted input to `agg`.
    if isinstance(agg, str):
        agg = eval(agg)

    overlap_density_weekly(inpath, outpath, agg)


if __name__ == "__main__":
    # Sub-command name on the command line -> function that implements it.
    commands = {
        'authors': author_overlap_density,
        'terms': term_overlap_density,
        'author_weekly': author_overlap_density_weekly,
        'term_weekly': term_overlap_density_weekly,
    }
    fire.Fire(commands)