initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
17
code/data_processing/05_save_descriptives.R
Normal file
17
code/data_processing/05_save_descriptives.R
Normal file
@@ -0,0 +1,17 @@
|
||||
# Load the tab-separated abstracts table and tidy the columns used later in
# this script. read.delim() is read.csv() with sep = "\t" as its default, so
# the parsing rules are unchanged.
df <- read.delim("processed_data/abstracts.tsv", strip.white = TRUE)

# Store the date column as Date.
df$date <- as.Date(df$date)

# An empty modal_country means "unknown": record it as NA.
df$modal_country[df[["modal_country"]] == ""] <- NA

# Four-digit year as text. Format the Date vector itself rather than a
# one-column data frame, so the result is always a plain character column.
df$year <- format(df[["date"]], "%Y")
|
||||
|
||||
# Keep the paper id and text of every paper whose abstract is non-empty.
abstracts <- df[df[["abstract"]] != "", c("eid", "abstract")]

# Word count per abstract: the number of runs of alphanumeric characters.
# gregexpr() is vectorised over all abstracts and reports -1 when nothing
# matches, so counting the positive match positions gives 0 for such text.
# This avoids apply(), which would coerce the whole data frame to a
# character matrix row by row just to read one column.
word_count <- vapply(
  gregexpr("[[:alnum:]]+", as.character(abstracts[["abstract"]])),
  function(match_pos) sum(match_pos > 0),
  integer(1)
)
# Label each count with the row name of its abstract so the counts can be
# matched back to the rows of `abstracts`.
names(word_count) <- row.names(abstracts)
|
||||
|
||||
# Read the paper/subject table and join it to the paper table, matching
# df$eid against s$paper_eid.
s <- read.delim("processed_data/paper_subject_table.tsv")
full <- merge(x = df, y = s, by.x = "eid", by.y = "paper_eid")

# Drop the abstract text from `full` and `df` before saving, so those two
# tables do not carry it; the text is kept only in `abstracts`.
full[["abstract"]] <- NULL
df[["abstract"]] <- NULL

# Write all of the data sets to a single .RData file.
save(
  list = c("df", "abstracts", "s", "full", "word_count"),
  file = "paper/data/orig_data_sets.RData"
)
|
||||
Reference in New Issue
Block a user