1
0
Files
social-media-chapter/code/topic_modeling/01_make_paper_files.py
Benjamin Mako Hill dd420c77de initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00

104 lines
3.6 KiB
Python

'''Creates the figures and tables for LaTeX'''
import pandas as pd
import numpy as np
import datetime
import argparse
import os
# Human-readable labels for the 12 LDA topics, in model column order.
# These become the column headers of the LaTeX top-words tables and the
# renamed topic columns of the abstracts data frame in main().
# Fix: corrected the typo 'Consumer Analsyis' -> 'Consumer Analysis'.
topic_names = [
    'Media Use',
    'Social Network Analysis',
    'Consumer Analysis',
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]
def main():
    """Build the paper's topic-model artifacts.

    Reads the top-words CSV and the per-abstract LDA topic-weight CSV,
    writes two LaTeX tables of top words (split in half because the full
    table is too wide), computes yearly topic summary statistics, and
    saves them into an RData file for the paper's R code.

    Command-line flags (all with defaults):
        -a  abstracts LDA CSV, -w  top-words CSV,
        -t  output directory for LaTeX tables, -o  RData output path.
    """
    parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
            default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
            default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
            default='paper/tables/')
    parser.add_argument('-o', help='RData output file location',
            default='paper/data/topic_model_data.RData')
    args = parser.parse_args()

    # Make the top_words tables, labeled with human-readable topic names.
    tw = pd.read_csv(args.w)
    tw.columns = topic_names
    # Save as 2 different tables, because they are too long for one page.
    if not os.path.exists(args.t):
        os.makedirs(args.t)
    # os.path.join instead of string concatenation so -t works with or
    # without a trailing slash.
    tw.to_latex(os.path.join(args.t, 'topic_words1.tex'),
                index=False, columns=tw.columns[:6])
    tw.to_latex(os.path.join(args.t, 'topic_words2.tex'),
                index=False, columns=tw.columns[6:])

    # Load the abstracts and topics data.
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)
    df.date = pd.to_datetime(df.date)
    # Remove papers from 2016 since we don't have the entire year,
    # so graphs would be misleading.
    df = df[df.date <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')
    # The LDA weights are the last n_topics columns; rename them.
    df.columns = list(df.columns[:-n_topics]) + topic_names

    # Group the topic columns by publication year.
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
    # Total amount published in each topic by year.
    topic_sums = topics_by_year.sum()
    # Mean amount published in each topic by year.
    topic_means = topics_by_year.mean()

    # Each document has a distribution over topics (e.g. T1: .5, T2: .3,
    # T3: 0, T4: .2). To account for how influential a paper is, scale
    # every topic column by the paper's logged citation count.
    # DataFrame.mul(..., axis=0) aligns the citation series on the row
    # index — equivalent to the old per-column apply, but idiomatic.
    citation_weighted_topics = df[df.columns[-n_topics:]].mul(
        np.log1p(df.cited_by_count), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## Write the summary data frames out to R.
    # rpy2 is imported lazily so the table-making half of the script
    # doesn't require an R installation at import time.
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    r = {'weighted_sums': weighted_sums,
         'topic_sums': topic_sums,
         'topic_means': topic_means}
    for var_name, x in r.items():
        # R convention: dots rather than underscores in variable names.
        robjects.r.assign(var_name.replace("_", "."), x)
    out_dir = os.path.dirname(args.o)
    # Guard on a non-empty dirname: os.makedirs('') raises when -o is a
    # bare filename in the current directory.
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    robjects.r('save({},file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o
        ))
    robjects.r("rm(list=ls())")
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()