We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
104 lines · 3.6 KiB · Python
'''Creates the figures and tables for LaTeX'''
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import datetime
|
|
import argparse
|
|
import os
|
|
|
|
# Human-readable names for the 12 LDA topics, in the same order as the
# topic columns of the model output. These become the column headers in
# the LaTeX tables and the variable names written to the RData file.
topic_names = [
    'Media Use',
    'Social Network Analysis',
    'Consumer Analysis',  # fixed typo: was 'Consumer Analsyis'
    'Education',
    'Quantitative Analysis',
    'Information Spread',
    'Health',
    'Sentiment Analysis',
    'News',
    'HCI',
    'Influence',
    'Methodology'
]
|
|
|
|
def main():
    '''Build the LaTeX top-words tables and write per-year topic summary
    statistics (raw sums, means, and citation-weighted sums) to an RData
    file for the paper.

    Side effects:
      - writes topic_words1.tex / topic_words2.tex under the -t directory
      - writes an RData file at the -o path (creating directories as needed)
    '''
    parser = argparse.ArgumentParser(description='Takes the LDA info and top words and creates an RData file with summary statistics')
    parser.add_argument('-a', help='Abstracts LDA file',
            default='processed_data/abstracts_LDA.csv')
    parser.add_argument('-w', help='Top words file',
            default='processed_data/top_words.csv')
    parser.add_argument('-t', help='Topic tables directory',
            default='paper/tables/')
    parser.add_argument('-o', help = 'RData output file location',
            default = 'paper/data/topic_model_data.RData')

    args = parser.parse_args()

    # Make the top_words tables
    tw = pd.read_csv(args.w)
    # Add names
    tw.columns = topic_names
    # exist_ok avoids the check-then-create race of the old
    # `if not os.path.exists(...)` pattern.
    os.makedirs(args.t, exist_ok=True)
    # Save as 2 different tables, because they are too long.
    # os.path.join (rather than string concatenation) keeps this correct
    # when -t is given without a trailing slash.
    tw.to_latex(os.path.join(args.t, 'topic_words1.tex'), index=False, columns=tw.columns[:6])
    tw.to_latex(os.path.join(args.t, 'topic_words2.tex'), index=False, columns=tw.columns[6:])

    # Load the abstracts and topics data
    df = pd.read_csv(args.a)
    n_topics = len(tw.columns)
    # Change to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Remove papers from 2016 since we don't have the entire year, so graphs are misleading
    df = df[df['date'] <= pd.to_datetime('2015-12-31')]
    df = df.set_index('date')
    # Rename the last columns as the topic names
    df.columns = list(df.columns[:-n_topics]) + topic_names
    # Group by year, and get only the LDA columns
    topics_by_year = df.groupby(lambda x: x.year)[df.columns[-n_topics:]]
    # Get summary statistics for each topic
    # Total amount published in each topic by year
    topic_sums = topics_by_year.sum()
    # Mean amount published in each topic
    topic_means = topics_by_year.mean()

    # Now, we weight the contributions by how much a paper has been cited.
    # Remember, each document has a distribution of topics that it belongs to, so a given document might look like:
    # T1: .5
    # T2: .3
    # T3: 0
    # T4: .2
    # To account for how influential a paper is, we take all of the topic columns for a document
    # and multiply their weights by the logged citations the paper has received.
    # .mul(..., axis=0) is the vectorized equivalent of the old
    # column-wise apply(lambda x: x * np.log1p(...)).
    citation_weighted_topics = df[df.columns[-n_topics:]]
    citation_weighted_topics = citation_weighted_topics.mul(np.log1p(df['cited_by_count']), axis=0)
    weighted_sums = citation_weighted_topics.groupby(lambda x: x.year).sum()

    ## write data to R
    # import code to write r modules and create our variable we'll write to
    # (imported lazily so the rest of the pipeline works without rpy2)
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    r = {'weighted_sums' : weighted_sums,
         'topic_sums' : topic_sums,
         'topic_means' : topic_means }

    # R convention uses dots, not underscores, in variable names.
    for var_name, x in r.items():
        robjects.r.assign(var_name.replace("_", "."), x)

    # Guard against an empty dirname (e.g. -o given as a bare filename),
    # which would make os.makedirs('') raise.
    out_dir = os.path.dirname(args.o)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    robjects.r('save({},file = "{}")'.format(
        ",".join([k.replace("_", ".") for k in r.keys()]),
        args.o
    ))
    # Clear the R session so repeated calls start clean.
    robjects.r("rm(list=ls())")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|