We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
104 lines · 3.6 KiB · Python
'''Creates the figures and tables for LaTeX'''
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import datetime
|
|
import argparse
|
|
import os
|
|
|
|
# Human-readable names for the 12 LDA topics, in the same order as the
# topic columns of the model output. These become the column headers in
# the LaTeX tables and the variable names written to the RData file.
topic_names = [
    'Media Use',
    'Social Network Analysis',
    'Consumer Analysis',  # fixed typo: was 'Consumer Analsyis'
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]
|
|
|
|
def main():
    '''Build the LaTeX top-words tables and write per-year topic summary
    statistics (raw sums, means, and citation-weighted sums) to an RData
    file for the paper.

    Side effects:
      - writes topic_words1.tex / topic_words2.tex under the -t directory
      - writes an RData file at the -o path (creating directories as needed)
    '''
    parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
            default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
            default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
            default='paper/tables/')
    parser.add_argument('-o', help = 'RData output file location',
            default = 'paper/data/topic_model_data.RData')

    args = parser.parse_args()

    # Make the top_words tables
    tw = pd.read_csv(args.w)
    # Add names
    tw.columns = topic_names
    # exist_ok avoids the check-then-create race of the old
    # `if not os.path.exists(...)` pattern.
    os.makedirs(args.t, exist_ok=True)
    # Save as 2 different tables, because they are too long.
    # os.path.join (rather than string concatenation) keeps this correct
    # when -t is given without a trailing slash.
    tw.to_latex(os.path.join(args.t, 'topic_words1.tex'), index=False, columns=tw.columns[:6])
    tw.to_latex(os.path.join(args.t, 'topic_words2.tex'), index=False, columns=tw.columns[6:])

    # Load the abstracts and topics data
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)
    # Change to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Remove papers from 2016 since we don't have the entire year, so graphs are misleading
    df = df[df['date'] <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')
    # Rename the last columns as the topic names
    df.columns = list(df.columns[:-n_topics]) + topic_names
    # Group by year, and get only the LDA columns
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
    # Get summary statistics for each topic
    # Total amount published in each topic by year
    topic_sums = topics_by_year.sum()
    # Mean amount published in each topic
    topic_means = topics_by_year.mean()

    # Now, we weight the contributions by how much a paper has been cited.
    # Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
    # T1: .5
    # T2: .3
    # T3: 0
    # T4: .2
    # To account for how influential a paper is, we take all of the topic columns for a document
    # and multiply their weights by the logged citations the paper has received.
    # .mul(..., axis=0) is the vectorized equivalent of the old
    # column-wise apply(lambda x: x * np.log1p(...)).
    citation_weighted_topics = df[df.columns[-n_topics:]]
    citation_weighted_topics = citation_weighted_topics.mul(np.log1p(df['cited_by_count']), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## write data to R
    # import code to write r modules and create our variable we'll write to
    # (imported lazily so the rest of the pipeline works without rpy2)
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    r = {'weighted_sums' : weighted_sums,
         'topic_sums' : topic_sums,
         'topic_means' : topic_means }

    # R convention uses dots, not underscores, in variable names.
    for var_name, x in r.items():
        robjects.r.assign(var_name.replace("_", "."), x)

    # Guard against an empty dirname (e.g. -o given as a bare filename),
    # which would make os.makedirs('') raise.
    out_dir = os.path.dirname(args.o)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    robjects.r('save({},file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o
    ))
    # Clear the R session so repeated calls start clean.
    robjects.r("rm(list=ls())")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|