initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
This commit is contained in:
89
code/prediction/01-build_control_variables.R
Normal file
89
code/prediction/01-build_control_variables.R
Normal file
@@ -0,0 +1,89 @@
# Load shared helpers for the prediction scripts (restrict() is used further
# down in this script and is not defined here -- presumably it lives in
# utils.R; confirm).
source("code/prediction/utils.R")

# use this to store things for use in the paper
# (starts as NULL; `$<-` assignments later in the script turn it into a list)
pred.descrip <- NULL

# Paper-level records, one row per paper.
abstracts <- read.delim("processed_data/abstracts.tsv", header=TRUE,
                        stringsAsFactors=FALSE, sep="\t")

# the abstract text itself is not needed for the control variables
abstracts <- subset(abstracts, select = -abstract)

# keep only rows with a known aggregation type that is not a trade journal
abstracts <- abstracts[!is.na(abstracts$aggregation_type) &
                       abstracts$aggregation_type != "Trade Journal", ]

# num_citations is renamed to works_cited; missing counts are treated as zero
names(abstracts)[names(abstracts) == 'num_citations'] <- 'works_cited'
abstracts$works_cited[is.na(abstracts$works_cited)] <- 0

# affiliations
affiliations <- read.delim("processed_data/paper_aff_table.tsv",
                           header=TRUE, stringsAsFactors=FALSE,
                           sep="\t")

# eliminate missing values
# The explicit is.na() test on organization matters: without it, a row with a
# valid ID but an NA organization produces an NA in the logical index, and
# data-frame indexing turns that into an all-NA row instead of dropping it.
affiliations <- affiliations[!is.na(affiliations$affiliation_id) &
                             !is.na(affiliations$organization) &
                             affiliations$organization != "", ]

# Return the organization name that appears most often among the rows of
# `aff.df` carrying a given affiliation ID.
#
# Args:
#   aff.id: a single affiliation ID to look up.
#   aff.df: data frame with `affiliation_id` and `organization` columns;
#           defaults to the global `affiliations` table.
#
# Returns:
#   A length-one character vector holding the modal organization name for
#   `aff.id`, or NA_character_ when no organization is recorded for that ID
#   (so callers that vectorise over IDs always get one string per ID).
remap.affiliations <- function(aff.id,
                               aff.df = affiliations){
    # Look the ID up in the table we were handed rather than the global one;
    # which() drops NA comparisons so an NA ID can never select rows.
    orgs <- aff.df$organization[which(aff.df$affiliation_id == aff.id)]

    # sort() is ascending, so the most frequent name ends up last
    org.counts <- sort(table(orgs))
    if (length(org.counts) == 0) {
        return(NA_character_)
    }

    org.modal <- names(tail(org.counts, 1))
    return(org.modal)
}

# Replace each row's organization with the modal organization for its
# affiliation ID. The modal name depends only on the ID, so it is computed
# once per distinct ID and then looked up per row, instead of rescanning the
# whole table for every row. vapply() guarantees a character vector back.
aff.ids <- unique(affiliations$affiliation_id[!is.na(affiliations$affiliation_id)])
modal.orgs <- vapply(aff.ids, remap.affiliations, character(1),
                     USE.NAMES = FALSE)
affiliations$organization <- modal.orgs[match(affiliations$affiliation_id,
                                              aff.ids)]
rm(aff.ids, modal.orgs)

# keep only the paper ID and the cleaned organization name, under the column
# names used when merging below
affiliations <- subset(affiliations, select = c(paper_eid,
                                                organization))
names(affiliations) <- c("eid", "affiliation")

# need to remove repeat affiliations
# (keeps the first affiliation row listed for each paper)
affiliations <- affiliations[!duplicated(affiliations$eid), ]

######################################
# Assemble the covariate data frame `d`, one row per paper in `abstracts`.
d <- abstracts[, c("eid", "language", "modal_country",
                   "source_title", "works_cited")]

# dichotomous dependent variable
d$cited <- abstracts$cited_by_count > 0

# store this here for use in the paper before we run any restrictions:
pred.descrip$cited <- d$cited
pred.descrip$cites <- abstracts$cited_by_count

# We want these to be categorical variables
d$modal_country <- factor(d$modal_country)
d$language <- factor(d$language)
d$subject <- factor(abstracts$first_ASJC_subject_area)
d$source_title <- factor(d$source_title)

# parse the publication date once and reuse it for both month and year
pub.date <- as.POSIXlt(abstracts$date)
d$month <- factor(strftime(pub.date, format= "%m"))
# except for pub year - keep that continuous
d$year <- as.numeric(strftime(pub.date, format="%Y"))
rm(pub.date)

# bring in org affiliations
d <- merge(d, affiliations, by="eid") # note that this drops papers
                                      # w/out org info

d$affiliation <- factor(d$affiliation)

##### Restrictions:

### do this explicitly so that changes are easy:
# NOTE(review): restrict() is not defined in this script -- presumably it
# comes from code/prediction/utils.R and filters `d` on the given column with
# a threshold of 1; confirm its exact semantics there. Each call sees the
# result of the previous one, so the order of these lines may matter.
d <- restrict(d, d$affiliation, 1)
d <- restrict(d, d$subject, 1)
d <- restrict(d, d$source_title, 1)
d <- restrict(d, d$language, 1)
d <- restrict(d, d$modal_country, 1)

# Placeholders left by the authors -- presumably covariates still to be added:
# n.authors
# per author prior citations

# stash the restricted covariates next to the unrestricted outcome vectors
# stored earlier, and write everything out for the paper
pred.descrip$covars <- d
save(pred.descrip, file = "paper/data/prediction_descriptives.RData")

# drop the large intermediates from the workspace; pred.descrip is kept
rm(d, abstracts, affiliations)

Reference in New Issue
Block a user