From d259908c974d18345bc0a0a0c377d3d2817aee2d Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Tue, 7 Mar 2023 15:36:34 -0800 Subject: [PATCH] anonymize and remove latex comments --- article.Rtex | 434 +++------------------------------- bayesnets.tex | 31 +-- flowchart_recommendations.tex | 8 - 3 files changed, 36 insertions(+), 437 deletions(-) diff --git a/article.Rtex b/article.Rtex index eb0910c..57a861e 100644 --- a/article.Rtex +++ b/article.Rtex @@ -37,7 +37,6 @@ source('resources/real_data_example.R') \setcounter{secnumdepth}{3} -%Code listing style named "mystyle" \lstdefinestyle{mystyle}{ backgroundcolor=\color{backcolour}, commentstyle=\color{codegreen}, keywordstyle=\color{magenta}, @@ -55,61 +54,45 @@ source('resources/real_data_example.R') showtabs=false, tabsize=2 } - -% \usepackage[garamond]{mathdesign} - -% \usepackage[letterpaper,left=1in,right=1in,top=1in,bottom=1in]{geometry} - -% packages i use in essentially every document \usepackage{graphicx} \usepackage{enumerate} -% packages i use in many documents but leave off by default \usepackage{amsmath}%}, amsthm, amssymb} \DeclareMathOperator*{\argmin}{arg\,min} % thin space, limits underneath in displays \DeclareMathOperator*{\argmax}{arg\,max} % thin space, limits underneath in displays \usepackage{subcaption} -% import and customize urls -% \usepackage[usenames,dvipsnames]{color} -% \usepackage[breaklinks]{hyperref} - \hypersetup{colorlinks=true, linkcolor=black, citecolor=black, filecolor=blue, urlcolor=blue, unicode=true} -% add bibliographic stuff + \usepackage[american]{babel} \usepackage{csquotes} \usepackage[natbib=true, style=apa, sortcites=true, backend=biber]{biblatex} \addbibresource{Bibliography.bib} \DeclareLanguageMapping{american}{american-apa} -\defbibheading{secbib}[\bibname]{% - \section*{#1}% - \markboth{#1}{#1}% - \baselineskip 14.2pt% +\defbibheading{secbib}[\bibname]{ + \section*{#1} + \markboth{#1}{#1} + \baselineskip 14.2pt \prebibhook} 
\def\citepos#1{\citeauthor{#1}'s (\citeyear{#1})} \def\citespos#1{\citeauthor{#1}' (\citeyear{#1})} \newcommand\TODO[1]{\textsc{\color{red} #1}} -% I've gotten advice to make this as general as possible to attract the widest possible audience. \title{Misclassification in Automated Content Analysis Causes Bias in Regression. Can We Fix It? Yes We Can!} \shorttitle{Can We Fix It? Yes We Can!} -\authorsnames[1,2,3]{Nathan TeBlunthuis, Valerie Hase, Chung-hong Chan} -\authorsaffiliations{{{School of Information, University of Michigan},{Department of Communication Studies, Northwestern University}}, {Department of Media and Communication, LMU Munich}, {GESIS - Leibniz-Institut für Sozialwissenschaften}} -\leftheader{TeBlunthuis, Hase \& Chan} - \keywords{ Automated Content Analysis; Machine Learning; Classification Error; Attenuation Bias; Simulation; Computational Methods; Big Data; AI } \abstract{ -%We show how automated classifiers (ACs), even biased ACs without high accuracy, can be statistically useful in communication research. + Automated classifiers (ACs), often built via supervised machine learning (SML), can categorize large, statistically powerful samples of data ranging from text to images and video, and have become widely popular measurement devices in communication science and related fields. Despite this popularity, even highly accurate classifiers make errors that cause misclassification bias and misleading results in downstream analyses—unless such analyses account for these errors. As we show in a systematic literature review of SML applications, @@ -118,24 +101,24 @@ In principle, existing statistical methods can use ``gold standard'' validation We introduce and test such methods, including a new method we design and implement in the R package \texttt{misclassificationmodels}, via Monte Carlo simulations designed to reveal each method's limitations, which we also release. 
Based on our results, we recommend our new error correction method as it is versatile and efficient. In sum, automated classifiers, even those below common accuracy standards or making systematic misclassifications, can be useful for measurement with careful study design and appropriate error correction methods. } -% fix bug in apa7 package: https://tex.stackexchange.com/questions/645947/adding-appendices-in-toc-using-apa7-package + \begin{document} \maketitle -%\section{Introduction} -%\tableofcontents -%\clearpage + + + \emph{Automated classifiers} (ACs) based on supervised machine learning (SML) have rapidly gained popularity -as part of the \emph{automated content analysis} toolkit in communication science \citep{baden_three_2022}. With ACs, researchers can categorize large samples of text, images, video or other types of data into predefined categories \citep{scharkow_thematic_2013}. Studies for instance use SML-based classifiers to study frames \citep{burscher_teaching_2014}, tonality \citep{van_atteveldt_validity_2021}, %even ones as seemingly straightforward as sentiment \citep{van_atteveldt_validity_2021}, toxicity \citep{fortuna_toxic_2020} +as part of the \emph{automated content analysis} toolkit in communication science \citep{baden_three_2022}. With ACs, researchers can categorize large samples of text, images, video or other types of data into predefined categories \citep{scharkow_thematic_2013}. Studies for instance use SML-based classifiers to study frames \citep{burscher_teaching_2014}, tonality \citep{van_atteveldt_validity_2021}, or civility \citep{hede_toxicity_2021} in news media texts or social media posts. -%and institutional frameworks \citep{rice_machine_2021} -% TODO: restore citation to fortuna_toxic_2020 below + + However, there is increasing concern about the validity of automated content analysis for studying theories and concepts from communication science \citep{baden_three_2022, hase_computational_2022}. 
We add to this debate by analyzing \emph{misclassification bias}---how misclassifications by ACs distort statistical findings—unless correctly modeled \citep{fong_machine_2021}. Research areas where ACs have the greatest potential—e.g., content moderation, social media bots, affective polarization, or radicalization—are haunted by the specter of methodological questions related to misclassification bias \citep{rauchfleisch_false_2020}: How accurate must an AC be to measure a variable? Can an AC built for one context be used in another \citep{burscher_using_2015,hede_toxicity_2021}? Is comparing automated classifications to some external ground truth sufficient to claim validity? How do biases in AC-based measurements affect downstream statistical analyses \citep{millimet_accounting_2022}? -%Knowing that high classification accuracy limits the risks of misleading inference, careful researchers might use only ACs with excellent predictive performance. + Our study begins with a demonstration of misclassification bias in a real-world example based on the Perspective toxicity classifier. Next, we provide a systematic literature review of $N = 48$ studies employing SML-based text classification. @@ -145,34 +128,12 @@ Our review demonstrates a troubling lack of attention to the threats ACs introdu Our primary contribution, an effort to rescue ACs from this dismal state, is to \emph{introduce and test methods for correcting misclassification bias} \citep{carroll_measurement_2006, buonaccorsi_measurement_2010, yi_handbook_2021}. We consider three recently proposed methods: \citet{fong_machine_2021}'s generalized method of moments calibration method, \citet{zhang_how_2021}'s pseudo-likelihood models, and \citet{blackwell_unified_2017-1}'s application of imputation methods. 
To overcome these methods' limitations, we draw a general likelihood modeling framework from the statistical literature on measurement error \citep{carroll_measurement_2006} and tailor it to the problem of misclassification bias. Our novel implementation is the experimental R package \texttt{misclassificationmodels}. We test these four error correction methods and compare them against ignoring misclassification (the naïve approach) and refraining from automated content analysis by only using manual coding (the feasible approach). We use Monte Carlo simulations to model four prototypical situations identified by our review: Using ACs to measure either (1) an independent or (2) a dependent variable where the classifier makes misclassifications that are either (a) easy to correct (when an AC is unbiased and misclassifications are uncorrelated with covariates, i.e., \emph{nonsystematic misclassification}) or (b) more difficult (when an AC is biased and misclassifications are correlated with covariates, i.e., \emph{systematic misclassification}). -%The more difficult cases are important. -%As the real-data example we provide demonstrates, even modest biases in very accurate ACs can cause misleading statistical findings. -% Such biases can easily result when classifier errors affect human behavior, such as that of social media moderators \maskparencite{teblunthuis_effects_2021}. Studies using classifiers from APIs that are also used in sociotechnical systems therefore be particularly prone to to differential error, which can cause misleading statistics even when classification accuracy is high. - -% Our Supplementary Materials present numerous extensions of these scenarios. We show that none of the existing error correction methods are effective in all scenarios. -%— multiple imputation fails in scenario 2; GMM calibration fails in scenario 1b and is not designed for scenario 2; and the pseudo-likelihood method fails in scenario 1 and in scenario 2b.
When correctly applied, our likelihood modeling is the only correction method recovering the true parameters in all scenarios. %We provide our implementation as an R package. - -% , and our approach based on maximum likelihood methods \citep{carroll_measurement_2006} . - - %By doing so, we follow a handful of recent studies in which social scientists have used samples of human-labeled \emph{validation data} to account for misclassification by automated classifiers. - - % This paragraph is likely to get cut, but its useful so that we have a working outline: In what follows, we begin with an overview of automated content analysis to describe how AC-based measures can affect downstream analyses and how these errors thus threaten progress in automated text classification often used in the field of Computational Social Science (CSS). We substantiate our claims via a systematic literature review of \emph{N}=49 empirical studies employing SML for classification (see \nameref{appendix:lit.review} for details). - - -% Although the methods above are all effective in bivariate least squares regression when an AC is used to measure a covariate, validation data are error-free, and measurement error is \emph{nondifferential} (conditionally independent of the outcome given other covariates), -% these methods all have limitations in more general cases. Below, we present simulated scenarios in which each of these methods fail to recover the true parameters. - -% so long as the coders' errors are conditionally independent given observable variables. - -% In our discussion section, we provide detailed recommendations based on our literature review and our simulations. According to our simulations, even biased classifiers without high predictive performance can be useful in conjunction with appropriate validation data and error correction methods. 
As a result, we are optimistic about the potential of ACs and automated content analysis for communication science and related fields—if researchers correct for misclassification. Current practices of ``validating'' ACs by making misclassification rates transparent via metrics such as the F1 score, however, provide little safeguard against misclassification bias. In sum, we make a methodological contribution by introducing the often-ignored problem of misclassification bias in automated content analysis, testing error correction methods to address this problem via Monte Carlo simulations, and introducing a new method for error correction. -%The required assumptions for error correction methods are no more difficult than those already commonly adopted in traditional content analyses—and much more reasonable than the current default approach. -%This method can succeed where others fail, is easily applied by experienced regression modelers, and is straightforward to extend. Profoundly, we conclude that automated content analysis will progress not only---or even primarily---by building more accurate classifiers but by rigorous human annotation and statistical error modeling. \section{Why Misclassification is a Problem: an Example Based on the Perspective API} @@ -182,7 +143,7 @@ This inevitable misclassification causes bias in statistical inference \citep{ca As shown next, however, relying on toxicity scores created by ACs such as the Perspective API as (in-)dependent variables produces different results than using measurements created via manual annotation. To illustrate this, we use the Civil Comments dataset released in 2019 by Jigsaw, the Alphabet corporation subsidiary behind the Perspective API. Methodological details on the data and our example are available in Appendix \ref{appendix:perspective}. The dataset has \Sexpr{f(dv.example[['n.annotated.comments']])} English-language comments made on independent news sites. 
It also includes manual annotations of each comment concerning its toxicity (\emph{toxicity}), whether it discloses aspects of personal identity like race or ethnicity \emph{(identity disclosure)}, and the number of likes it received \emph{(number of likes)}. -%As obtaining manual annotations for 448,000 comments is impractical for all but the most well-resourced research teams, our subsequent analyses rely on the full dataset and, as a more realistic and feasible approach, a smaller random sample of \Sexpr{f(iv.sample.count)} (\Sexpr{format.percent(iv.sample.prop)}) manually annotated comments. + In addition to manual annotations of each comment, we obtained AC-based toxicity classifications from the Perspective API in November 2022. Perspective's toxicity classifier performs very well, with an accuracy of \Sexpr{format.percent(iv.example[['civil_comments_accuracies']][['toxicity_acc']])} and an F1 score of \Sexpr{round(iv.example[['civil_comments_f1s']][['toxicity_f1']],2)}. Nevertheless, if we treat human annotations as the ground-truth, the classifier makes systematic misclassifications for it is modestly biased and disproportionately misclassifies comments disclosing racial or ethnic identity as toxic (Pearson's $\rho=\Sexpr{round(dv.example[['civil_comments_cortab']]['toxicity_error','race_disclosed'],2)}$). @@ -206,25 +167,15 @@ print(p) \end{subfigure} -\caption{Bias through Misclassification: a Real-World Example Using the Perspective API and the Civil Comments Dataset. -%Figure \ref{fig:real.data.example.iv} compares a model using automatic toxicity classifications to a model using human toxicity annotations and shows that the 95\% confidence interval of the coefficient for likes contains 0. -%In Figure \ref{fig:real.data.example.dv}, a model predicting automatic toxicity classifications for toxicity detects a negative correlation between likes and toxicity that is not found when human annotations are used instead. 
A \Sexpr{format.percent(iv.sample.prop)} random sample of \Sexpr{f(iv.sample.count)} annotations does not provide sufficient statistical power to distinguish the false discovery from 0. -%In both examples, a random \Sexpr{format.percent(iv.sample.prop)} sample of \Sexpr{f(iv.sample.count)} annotations does not provide sufficient statistical power to distinguish the coefficient for likes from 0. Yet the methods we introduce can use this sample to model the misclassifications and obtain results close to those using the full dataset of annotations. -\label{fig:real.data.example} -} +\caption{Bias through Misclassification: a Real-World Example Using the Perspective API and the Civil Comments Dataset.\label{fig:real.data.example}} \end{figure} As shown in Figure \ref{fig:real.data.example.iv}, relying on AC-based toxicity classifications may lead researchers to reject a hypothesized direct relationship between likes and identity disclosure. Instead, the model suggests that their correlation is entirely mediated by toxicity. -%This is because the coefficient for likes is statistically indistinguishable from 0 and the coefficient for the interaction between likes and toxicity is positive and well-estimated. -In contrast, using human annotations would lead researchers to conclude a subtle positive direct relationship between likes and identity disclosure. %Using a smaller smaller sample of manually annotated data, as will often be more feasible due to limited resources, lacks sufficient statistical power to detect any such relationship. -%However, our method can use this sample of annotations to correct the bias introduced by Perspective's misclassifications while preserving enough statistical power to detect the direct relationship between likes and identity disclosure at the 95\% confidence level with estimates similar to those in the model using all \Sexpr{f(dv.example[['n.annotated.comments']])} annotations. 
+In contrast, using human annotations would lead researchers to conclude a subtle positive direct relationship between likes and identity disclosure. This demonstrates that even a very accurate AC can introduce type-II errors, i.e., researchers failing to reject a null hypothesis due to misclassification. Second, let us consider \emph{misclassification in a dependent variable}. We now predict the \emph{toxicity} of a comment with \emph{number of likes}, \emph{identity disclosure} in a comment, and their interaction as independent variables. As shown in Figure \ref{fig:real.data.example.dv}, using Perspective's classification of toxicity results in a small negative direct effect of likes. However, there is no detectable relationship when using manual annotations. As such, misclassification can also lead to type-I error, i.e., false discovery of a nonzero relationship. -%The model using a more feasible sample of \Sexpr{format.percent(dv.sample.prop) } of manual annotations cannot rule out such a weak relationship. -%(the estimated effect using the AC is in the 95\% confidence interval), but our error correction method using this sample and Perspective's automatic classifications together can do so. - \section{Why Transparency about Misclassification Is Not Enough} Although the Perspective API is certainly accurate enough to be useful to content moderators, the example above demonstrates that this does not imply usefulness for social science \citep{grimmer_machine_2021-1}. @@ -236,12 +187,6 @@ We argue that current common practices to address such limitations are insuffici These steps promote confidence in results by making misclassification transparent, but our example indicates bias can flow downstream into statistical inferences, despite high predictiveness. Instead of relying only on transparency rituals to ward off misclassification bias, researchers can and should use validation data to correct it.
-% \citep{obermeyer_dissecting_2019, kleinberg_algorithmic_2018, bender_dangers_2021, wallach_big_2019, noble_algorithms_2018}. -%For example, \citet{hede_toxicity_2021} show that, when applied to news datasets, the Perspecitve API overestimates incivility related to topics such as racial identity, violence, and sex. -%These automatic classifications will likely introduce differential measurement error to a regression model of an outcome related to such topics. -%Although the effect sizes in these cases are rather subtle and would not be detectable in smaller datasets, such small effects commonly found using large datasets can easily result from subtle biases in observational study designs \citep{kaplan_big_2014}. Such small effect sizes may not appear practically or theoretically important, but note that the consequences of bias from automatic classification for coefficients in these examples (i.e., the interaction term in the first example and \emph{identity disclosure} in the second) are larger. -%Importantly, these errors are correctable using human annotations. Although this example required \Sexpr{iv.sample.count} annotations, a large number representing considerable effort, to consistently do so, this is a small fraction of the entire dataset. - These claims may surprise because of the widespread misconception that misclassification causes only conservative bias (i.e., bias towards null effects). This is believed because it is true for bivariate least squares regression when misclassifications are nonsystematic \citep{carroll_measurement_2006, loken_measurement_2017, van_smeden_reflection_2020}.\footnote{Measurement error is \emph{classical} when it is nonsystematic and the variance of an AC's predictions is greater than the variance of the true value \citep{carroll_measurement_2006}.
Measurement error in an independent variable is called ``differential'' if it is not conditionally independent of the dependent variable given the other independent variables. @@ -251,107 +196,57 @@ with an independent variable. We use this more general term to simplify our disc However, as shown in our example, misclassification bias can be anti-conservative \citep{carroll_measurement_2006, loken_measurement_2017, van_smeden_reflection_2020}. In regression models with more than one independent variable, or in nonlinear models, such as the logistic regression we used in our example, even nonsystematic misclassification can cause bias away from 0. Second, systematic misclassification can bias inference in any direction. -%Researchers can check the assumption of nondifferential measurement error via graphical and statistical conditional independence tests \citep{carroll_measurement_2006, fong_machine_2021}. -%Users of ACs should be especially conscious of differential error due to the nonlinear behavior of many ACs \citep{breiman_statistical_2001}. ACs designed in one context and applied in another are likely to commit systematic misclassification. For example, the Perspective API used to classify toxic content was developed for social media comments but performs much worse when applied to news data \citep{hede_toxicity_2021}. Systematic misclassification may also arise when an AC used for measurement shapes behavior in a sociotechnical system under study. As examples, the Perspective API is used for online forum moderation \citep{hede_toxicity_2021}, as is the ORES API for Wikipedia moderators \citep{teblunthuis_effects_2021}. Misclassifications from such classifiers can be systematic because they have causal effects on outcomes related to moderation. 
-%TODO: uncomment citation below + If ACs become standard measurement devices, for instance -%the LIWC dictionary to measure sentiment \citep{boukes_whats_2020}, -%\citep{dobbrick_enhancing_2021} Google's Perspective API for measuring toxicity \citep[see critically][]{hosseini_deceiving_2017} or Botometer for classifying social media bots \citep[see critically][]{rauchfleisch_false_2020}, entire research areas may be subject to systematic biases. Even if misclassification bias is usually conservative, it can slow progress in a research area. Consider how \citet{scharkow_how_2017} argue that media's ``minimal effects'' on political opinions and behavior in linkage studies may be an artifact of measurement errors in both manual content analyses and self-reported media use in surveys. Conversely, if researchers selectively report statistically significant hypothesis tests, misclassification can introduce an upward bias in the magnitude of reported effect sizes and contribute to a replication crisis \citep{loken_measurement_2017}. - -% First, we note that when the anticipated effect size is large enough, traditional content analysis of a random sample has the advantage over the considerable complexity of automated content analysis. -% ACs should be used when costs prohibit traditional content analysis of sample size sufficient to detect anticipated effect sizes, but where collective a relatively small sample of validation data is tractable. - -% When the data used to train an AC is not representative of the study population, as is the case with commercial APIs or other black-box classifiers, this increases the risk of differential measurement error, which can introduce extremely misleading forms of statistical bias. Even this form of error can be addressed. - - -% Therefore, we recommend reporting (and preregistering) at least two aforementioned corrective methods in addition to uncorrected estimates. 
When machine learning classification is used for an independent variable, we recommend multiple imputation because it is robust to differential error and it simple to implement. However, our simulations show that multiple imputation does not work well when machine learning classification is used for the dependent variable. Greater care may be required if measurement error may be differential, because specifying the error model may open many degrees of research freedom and plausible error moe \section{Quantifying the Problem: Error Correction Methods in SML-based Text Classification} -% In traditional content analysis, humans use their judgement to classify messages, and automated content analysis uses computers as an instrument to - -% % can be defined either as a research approach or as an instrument. - -% In this paper, automated content analysis is defined as a research approach, which is a sub-type of content analysis for -% In contrast to manual content analysis, the difference is that the instrument used to code messages shifts from human judgment to computer algorithms \citep{scharkow2017content}. These computer algorithms, which can also be confusingly defined as ``automated content analysis" in the instrumental sense, are called automated coding techniques (versus manual coding techniques) in this paper. - - -% Social scientists have long recognized that measurement error can be an important methodological concern, but this concern has often been neglected \citep{schwartz_neglected_1985}. - - -% There have been several papers outlining what automated coding techniques are in the "toolbox" of communication researchers (key papers are \citep{scharkow2017content} and \citep{boumans:2015:tst}). -% Unsupervised and supervised machine learning procedures are deployed for coding. -% There has been discussion on the best practices for deploying unsupervised machine learning for communication research \citep{maier:2018:ALT}. 
-% This paper is going to focus only on classification. -% Researchers have raised concerns about validity issues of the approach \citep{scharkow2017content}. And by definition, the coding made by this technique is an imperfect surrogate of manual coding \citep{boumans:2015:tst}. When machine-classified surrogates are used in regression analyses for ``making replicable and valid inferences from texts", measurement errors are introduced \citep{fong_machine_2021}. A formal mathematical definition of these measurement errors is available later. - -% In the next section, all communication research studies with SML are reviewed to show how researchers deals with these measurement errors. - -% Furthermore, human classifiers also make errors and none of the prior methods consider how errors in the validation data can bias statistical results \citep{geis_statistical_2021, song_validations_2020, bachl_correcting_2017, scharkow_how_2017}. - - % Changeme to bring back citations after ICA - -%Content analysis focuses on ``\emph{making replicable and valid inferences from texts (or other meaningful matter) to the contexts of their use}'' \citep[p. 24, emphasis in original]{krippendorff_content_2018}. To understand how social scientists, including communication scholars, engage with the problem of misclassification in automated content analysis, -%SML classifiers enable researchers to inexpensively measure categorical variables in large data sets. This promises to be useful for study designs requiring large samples such as to infer effect sizes smaller than would be possible using smaller samples humans could feasibly classify. -%But are scholars aware that misclassification by ACs poses threats to the validity of downstream analyses? Although such issues in the context of manual content analysis have attracted much debate \citep{bachl_correcting_2017}, this is less true for misclassification by newly popular automatic classifiers. 
we conducted a systematic literature review of studies using supervised machine learning (SML) for text classification (see Appendix \ref{appendix:lit.review} in our Supplement for details).\footnote{Automated content analysis includes a range of methods both for assigning content to predefined categories (e.g., dictionaries) and for assigning content to unknown categories (e.g., topic modeling) \citep{grimmer_text_2013}. While we focus on SML, our arguments extend to other approaches such as dictionary-based classification and even beyond the specific context of text classification.} Our sample consists of studies identified by similar reviews on automated content analysis \citep{baden_three_2022, hase_computational_2022, junger_unboxing_2022, song_validations_2020}. Our goal is not to comprehensively review all SML studies -%\footnote{In fact, our review likely underestimates the use of the method, as we focused on text-based SML methods in the social science domain employed for empirical analyses.} but to provide a picture of common practices, with an eye toward awareness of misclassification and its statistical implications. We identified a total of 48 empirical studies published between 2013 and 2021, more than half of which were published in communication journals. Studies used SML-based text classification for purposes such as to measure frames \citep{opperhuizen_framing_2019} or topics \citep{vermeer_online_2020}. They often employed SML-based ACs to create dichotomous (50\%) or other categorical variables (23\%).\footnote{Metric variables were created in 35\% of studies, mostly via the non-parametric method by \citet{hopkins_method_2010}.} Of these empirical studies, many used SML-based ACs as independent variables (44\%) or dependent variables (40\%) in multivariate analyses, and 90\% reported univariate statistics such as proportions. 
-%— from the prevalence of topics in online news \citep{vermeer_online_2020} to incivility in social media posts \citep{su_uncivil_2018} —, + Overall, our review reveals a \emph{lack of transparency when reporting SML-based text classification}, similar to that previously reported \citep{reiss_reporting_2022}: A large share of studies do not report important methodological decisions related to sampling and sizes of training and test sets (see Appendix \ref{appendix:lit.review}). This lack of transparency concerning model validation not only limits the degree to which researchers can evaluate studies, but also makes replicating such analyses to correct misclassification bias nearly impossible. Most important, our review finds that \emph{studies almost never reflected upon nor corrected misclassification bias}. According to our review, 85\% of studies reported metrics such as recall or precision, but only 19\% of studies explicitly stated that an AC misclassified texts which may introduce measurement error. Only a single article reported using error correction methods. To address the clear need for methods for understanding misclassification bias and correcting it, we now introduce and discuss existing methods to do so. -%yi_handbook_2021,buonaccorsi_measurement_2010 + \section{Addressing the Problem: Existing Approaches for Correcting Misclassification} Statisticians have extensively studied measurement error (including misclassification), the problems it causes for statistical inference, and methods for correcting these problems \citep[see][]{carroll_measurement_2006, fuller_measurement_1987}. 
We narrow our focus to three existing methods recently proposed for dealing with misclassification bias in the context of automated content analysis: \citet{fong_machine_2021}'s GMM calibration method, multiple imputation \citep{blackwell_unified_2017-1}, and \citet{zhang_how_2021}'s pseudo-likelihood model.\footnote{Statisticians have studied other methods including simulation extrapolation, Bayesian estimation, and score function methods. As we argue in Appendix \ref{appendix:other.methods}, these error correction methods are not advantageous when manually annotated data is available, as is often the case with ACs.} -%Measurement error is a vast and deep subject in statistics. We recommend \citet{carroll_measurement_2006} as a graduate-level textbook on the subject. + In the interest of clarity, we introduce some notation. Say we want to estimate a regression model $Y = B_0 + B_1 X + B_2 Z + \varepsilon$ where $X$ is an independent variable for which a small sample of manually annotated data $X^*$ and automated classifications $W$ are observed. Fully observed are $Z$, a second independent variable and $Y$, the dependent variable. To illustrate, in our first real-world example, $X$ is toxicity, $X^*$ are the civil comment annotations, $W$ are the Perspective API's toxicity classification, $Z$ are likes, and $Y$ is identity disclosure. - Say the sample of annotated data $X^*$ is too small to convincingly test a hypothesis, but collecting additional annotations is too expensive. In contrast, an AC can make classifications $W$ for the entire dataset but introduces misclassification bias. How can we correct this bias in an automated content analysis? \emph{Regression calibration} uses observable variables, including automated classifications $W$ and other variables measured without error $Z$, to approximate the true value of $X$ \citep{carroll_measurement_2006}. 
\citet{fong_machine_2021} propose a regression calibration procedure designed for SML that we refer to as \emph{GMM calibration} or GMM.\footnote{\citet{fong_machine_2021} describe their method within an instrumental variable framework, but it is equivalent to regression calibration, the standard term in measurement error literature.} For their calibration model, \citet{fong_machine_2021} use 2-stage least squares (2SLS). They regress the observed variables $Z$ and AC predictions $W$ onto the manually annotated data and then use the resulting model to approximate $X$ as $\hat{X}$. They then use the generalized method of moments (gmm) to combine estimates based on the approximated independent variable $\hat{X}$ and estimates based on the manually annotated data $X^*$. This method makes efficient use of manually annotated data and provides an asymptotic theory for deriving confidence intervals. The GMM approach does not make strong assumptions about the distribution of the outcome $Y$, but can be invalidated by systematic misclassification \citep{fong_machine_2021}. GMM, like other regression calibration techniques, is not designed to correct for misclassification in the outcome. - \emph{Multiple imputation} (MI) treats misclassification as a missing data problem. It understands the true value of $X$ to be observed in manually annotated data $X^*$ and missing otherwise \citep{blackwell_unified_2017-1}. -%For example, the regression calibration step in \citet{fong_machine_2021}'s GMM method uses least squares regression to impute unobserved values of the covariate $X$. Indeed, \citet{carroll_measurement_2006} describe regression calibration when validation data are available as ``simply a poor person's imputation methodology'' (pp. 70). + Like regression calibration, multiple imputation uses a model to infer likely values of possibly misclassified variables. 
The difference is that multiple imputation samples several (hence \emph{multiple} imputation) entire datasets filling in the missing data from the predictive probability distribution of $X$ conditional on other variables $\{W,Y,Z\}$, then runs a statistical analysis on each of these sampled datasets and pools the results of each of these analyses \citep{blackwell_unified_2017-1}. Note that $Y$ is included among the imputing variables, giving the MI approach the potential to address \emph{differential error,} when systematic misclassification makes automatic classifications conditionally dependent on the outcome given other independent variables. \citet{blackwell_unified_2017-1} claim that the MI method is relatively robust when it comes to small violations of the assumption of nondifferential error. Moreover, in theory, the MI approach can be used for correcting misclassifications both in independent and dependent variables. -\emph{``Pseudo-likelihood''} methods (PL)—even if not always explicitly labeled this way—are another approach for correcting misclassification bias. \citet{zhang_how_2021} proposes a method that approximates the error model using quantities from the AC's confusion matrix—the positive and negative predictive values in the case of a mismeasured independent variable and the AC's false positive and false negative rates in the case of a mismeasured dependent variable. Because quantities from the confusion matrix are neither data nor model parameters, \citet{zhang_how_2021}'s method is technically a ``pseudo-likelihood'' method. A clear benefit is that this method only requires summary quantities derived from manually annotated data, for instance via a confusion matrix. %We will discuss likelihood methods in greater depth in the presentation of our MLA framework below. +\emph{``Pseudo-likelihood''} methods (PL)—even if not always explicitly labeled this way—are another approach for correcting misclassification bias. 
\citet{zhang_how_2021} proposes a method that approximates the error model using quantities from the AC's confusion matrix—the positive and negative predictive values in the case of a mismeasured independent variable and the AC's false positive and false negative rates in the case of a mismeasured dependent variable. Because quantities from the confusion matrix are neither data nor model parameters, \citet{zhang_how_2021}'s method is technically a ``pseudo-likelihood'' method. A clear benefit is that this method only requires summary quantities derived from manually annotated data, for instance via a confusion matrix. \subsection{Proposing Maximum Likelihood Adjustment for Misclassification} -% This section basically translates Carroll et al. for a technically advanced 1st year graduate student. We now elaborate on \emph{Maximum Likelihood Adjustment} (MLA), a new method we propose for correcting misclassification bias. Our method tailors \citet{carroll_measurement_2006}'s presentation of the general statistical theory of likelihood modeling for measurement error correction to the context of automated content analysis.\footnote{In particular see Chapter 8 (especially example 8.4) and Chapter 15 (especially 15.4.2).} The MLA approach deals with misclassification bias by maximizing a likelihood that correctly specifies an \emph{error model} of the probability of the automated classifications conditional on the true value and the outcome \citep{carroll_measurement_2006}. In contrast to the GMM and the MI approach, which predict values of the misclassified variable, the MLA method accounts for all possible values of the variable by ``integrating them out'' of the likelihood. ``Integrating out'' means adding possible values of a variable to the joint likelihood, weighted by the likelihood of the error model. MLA methods have four advantages in the context of ACs that reflect the benefits of integrating out partially observed discrete variables. 
First, they are general in that they can be applied to any model with a convex likelihood including generalized linear models (GLMs) and generalized additive models (GAMs). Second, assuming the model is correctly specified, MLA estimators are fully consistent whereas regression calibration estimators are only approximately consistent \citep{carroll_measurement_2006}. Practically, this means that MLA methods can have greater statistical efficiency and require less manually annotated data to make precise estimates. -%The MLA approach is conceptually different from the GMM one. The GMM approach first imputes likely values and then runs the main analysis on imputed values. By contrast, MLA approaches estimate—all in one step—the main analysis using the full dataset and the error model estimated using only the validation data \citep{carroll_measurement_2006}. Third, the MLA approach is applicable both for correcting for misclassification in a dependent and an independent variable. Fourth, and most important, MLA can be effective when misclassification is systematic. -%The idea is to use an \emph{error model} of the conditional probability of the automatic classifications given the true classifications and other variables on which automatic classifications depend. -%In other words, the error model estimates the conditional probability mass function of the automatic classifications. - -% When a variable is measured with error, this error introduces uncertainty. The overall idea of correcting an analysis with a mismeasured variable through likelihood modeling is to use - -%Including the error model in the likelihood effectively accounts for uncertainty of the true classifications and, assuming the error model gives consistent estimates of the conditional probability of the automatic classifications given the true values, is sufficient to obtain consistent estimates using MLA \citep{carroll_measurement_2006}. 
- \subsubsection{When an Automated Classifier Predicts an Independent Variable} In general, if we want to estimate a model $P(Y|\Theta_Y, X, Z)$ for $Y$ given $X$ and $Z$ with parameters $\Theta_Y$, we can use AC classifications $W$ predicting $X$ to gain statistical power without introducing misclassification bias by maximizing ($\mathcal{L}(\Theta|Y,W)$), the likelihood of the parameters $\Theta = \{\Theta_Y, \Theta_W, \Theta_X\}$ in a joint model of $Y$ and $W$ \citep{carroll_measurement_2006}. @@ -394,14 +289,14 @@ Additional details on the likelihood modeling approach are available in Appendix \section{Evaluating Misclassification Models: Monte Carlo Simulations} -% \TODO{Create a table summarizing the simulations and the parameters.} + We now present four Monte Carlo simulations (\emph{Simulations 1a}, \emph{1b}, \emph{2a}, and \emph{2b}) with which we evaluate existing methods (GMM, MI, PL) and our approach (MLA) for correcting misclassification bias. Monte Carlo simulations are a tool for evaluating statistical methods, including (automated) content analysis \citep[e.g.,][]{song_validations_2020,bachl_correcting_2017,geis_statistical_2021, fong_machine_2021,zhang_how_2021}. They are defined by a data generating process from which datasets are repeatedly sampled. Repeating an analysis for each of these datasets provides an empirical distribution of results the analysis would obtain over study replications. Monte Carlo simulation affords exploration of finite-sample performance, robustness to assumption violations, comparison across several methods, and ease of interpretability \citep{mooney_monte_1997}. Such simulations allow exploration of how results depend on assumptions about the data-generating process and analytical choices and are thus an important tool for designing studies that account for misclassification. -% Code for reproducing our simulations is available here: \url{https://osf.io/pyqf8/?view_only=c80e7b76d94645bd9543f04c2a95a87e}. 
+ @@ -412,7 +307,7 @@ We develop our MLA approach in the R package \texttt{misclassificationmodels}. For PL and MLA, we quantify uncertainty using the Fisher information quadratic approximation.\footnote{The code for reproducing our simulations and our experimental R package is available here: \url{https://osf.io/pyqf8/?view_only=c80e7b76d94645bd9543f04c2a95a87e}.} In addition, we compare these error correction methods to two common approaches in communication science: the \emph{feasible} estimator (i.e., conventional content analysis that uses only manually annotated data and not ACs) -%and illustrates the motivation for using an AC in these scenarios—validation alone provide insufficient statistical power for a sufficiently precise hypothesis test. + and the \emph{naïve} estimator (i.e., using AC-based classifications $W$ as stand-ins for $X$, thereby ignoring misclassification). According to our systematic review, the \emph{naïve} approach reflects standard practice in studies employing SML for text classification. @@ -420,22 +315,12 @@ We evaluate each of the six analytical approaches in terms of \emph{consistency} To evaluate efficiency, we repeat each simulation with different amounts of total observations, i.e., unlabeled data to be classified by an AC (ranging from \Sexpr{min(N.sizes)} to \Sexpr{max(N.sizes)} observations), and manually annotated observations (ranging from \Sexpr{min(m.sizes)} to \Sexpr{max(m.sizes)} observations). Since our review indicated that ACs are most often used to create binary variables, we restrict our simulations to misclassifications related to a binary (in-)dependent variable. -%\begin{equation} -% Y= B_0^* + B_1^*W + B_2^*Z + \varepsilon^* = B_0^* + B_1^*(X + \xi) + B_2^*Z -%\label{mod:measerr.ols} -%\end{equation} - - -%These simulations are designed to verify that error correction methods from prior work are effective in ideal scenarios and to create the simplest possible cases where these methods are inconsistent. 
Showing how prior methods fail is instructive for understanding how our MLA approach does better both in these artificial simulations and in practical projects. - \subsection{Four Prototypical Scenarios for Our Monte Carlo Simulations} We simulate regression models with two independent variables ($X$ and $Z$). This sufficiently constrains our study's scope but the scenario is general enough to be applied in a wide range of research studies. -%Simulating studies with two covariates lets us study how measurement error in one covariate can cause bias in coefficient estimates of other covariates. Whether the methods we evaluate below are effective or not depends on the conditional dependence structure among independent variables, the dependent variable $Y$, and automated classifications $W$. This structure determines if adjustment for systematic misclassification is required \citep{carroll_measurement_2006}. In Figure \ref{bayesnets}, we illustrate our scenarios via Bayesian networks representing the conditional dependence structure of variables \citep{pearl_fusion_1986}: -%In these figures, an edge between two variables indicates that they have a direct relationship. Two nodes that are not neighbors are statistically independent given the variables between them on the graph. For example, in Figure \ref{fig:simulation.1a}, the automatic classifications $W$ are conditionally independent of $Y$ given $X$ because all paths between $W$ and $Y$ contain $X$. This indicates that the model $Y=B_0 +B_1 W+ B_2 Z$ (the \emph{naïve estimator}) has non-differential error because the automatic classifications $W$ are conditionally independent of $Y$ given $X$. However, in Figure \ref{fig:simulation.1b}, there is an edge between $W$ and $Y$ to indicate that $W$ is not conditionally independent of $Y$ given other variables. Therefore, the naïve estimator has differential error. 
We first simulate two cases where an AC measures an independent variable without (\emph{Simulation 1a}) and with differential error (\emph{Simulation 1b}). Then, we simulate using an AC to measure the dependent variable, either one with misclassifications that are uncorrelated (\emph{Simulation 2a}) or correlated with an independent variable (\emph{Simulation 2b}). GMM is not designed to correct misclassifications in dependent variables, so we omit this method in \emph{Simulations 2a} and \emph{2b}. \input{bayesnets.tex} @@ -447,42 +332,18 @@ We first consider studies with the goal of testing hypotheses about the coeffici Y=B_0 + B_1 X + B_2 Z + \varepsilon \label{mod:true.ols} \end{equation} -%In our first real-data example, $Y$ was a discrete variable---whether a comment self-disclosed a racial or ethnic identity, $X$ was if a comment was toxic, and $Z$ was the number of likes. + In this simulated example, $Y$ is a continuous variable, $X$ is a binary variable measured with an AC, and $Z$ is a normally distributed variable with mean 0 and standard deviation \Sexpr{sim1.z.sd} measured without error. -%The simulated example could represent a study of $Y$, the time until an social media account is banned, $X$ if the account posted a comment including toxicity, and $Z$ the account's reputation score. $X$ and $Z$ are negatively correlated because high-reputation accounts may be less likely to post comments including toxicity. -%$Z$ can indicate if the message is in German or English, the two possible languages in the hypothetical study. -%Say that human content coders can observe $X$ perfectly, but each observation is so expensive that observing $X$ for a large sample is infeasible. -%Instead, the human coders can measure $X$ without error for a subsample of size $m << N$. -%To scale up content analysis, a SML-based AC makes predictions $W$ of $X$—for instance predicting if any comments from a social media user include toxicity. 
-Both simulations have a normally distributed dependent variable $Y$ and two binary independent variables $X$ and $Z$, which are balanced ($P(X)=P(Z)=0.5$) and correlated (Pearson's $\rho=\Sexpr{round(sim2a.cor.xz,2)}$). %Simulating balanced covariates serves simplicity so that accuracy is adequate to quantify the predictive performance of our simulated classifier. Simulating correlated covariates is helpful to study how misclassification in one variable affects parameter inference in other covariates. +Both simulations have a normally distributed dependent variable $Y$ and two binary independent variables $X$ and $Z$, which are balanced ($P(X)=P(Z)=0.5$) and correlated (Pearson's $\rho=\Sexpr{round(sim2a.cor.xz,2)}$). To represent a study design where an AC is needed to obtain sufficient statistical power, $Z$ and $X$ can explain only \Sexpr{format.percent(sim1.R2)} of the variance in $Y$. -% TODO, bring back when these simulations are in the appendix. -%Additional simulations in appendix \ref{appendix:sim1.imbalanced} show results for variations of \emph{Simulation 1} with imbalanced covariates explaining a range of variances, different classifier accuracies, heteroskedastic misclassifications and deviance from normality in the an outcome $Y$. In \emph{Simulation 1a} (Figure \ref{fig:simulation.1a}), we simulate an AC with \Sexpr{format.percent(sim1a.acc)} accuracy.\footnote{Classifier accuracy varies between our simulations because it is difficult to jointly specify classifier accuracy and the required correlations among variables and due to random variation between simulation runs. We report the median accuracy over simulation runs.} This reflects a situation where $X$ may be difficult to predict, but the AC, represented as a logistic regression model having linear predictor $W^*$ provides a useful signal. 
We simulate nondifferential misclassification because $W=X+\xi$, $\xi$ is normally distributed with mean $0$, and $\xi$ and $W$ are conditionally independent of $Y$ given $X$ and $Z$. -%($P(\xi| Y,X,Z) = P(\xi|X,Z)$). -%For simplicity, the AC's errors $\xi$ are independent of all other variables. In Appendix F, we demonstrate that the methods we study perform similarly when $\xi$ is heteroskedastic, correlated with $X$ or $Z$. Note that heteroskedasticity does not imply differential error. Suppose, for example, that AC's accuracy predicting rule violations $W$ depends on language $Z$. As a result, $\xi$ and $Z$ are correlated, and since time-till-ban $Y$ and repuation $Z$ are also correlated, $\xi$ is in turn correlated with $Y$. Despite this, the error in Model \ref{mod:measerr.ols} remains nondifferential, because $Y$ is conditionally independent of $\xi$ given $Z$ and $X$. - -% Measuring $X$ is expensive, perhaps requiring trained human annotators, but an automated classifier can predict $X$ with We choose this level of accuracy to reflect a situation where $X$ may be difficult to predict - -% The classifier, perhaps a proprietary API, has unobservable features $K$. The classifier's predictions $W=X + \xi$ are unbiased—the errors $\xi$ are not correlated with $Y$,$X$ or $Z$. Figure \ref{fig:simulation.1} shows a Bayesian network representing \emph{Simulation 1}'s conditional dependencies of $Z$, $Y$, $K$, $Z$ and $W$ as a directed acyclic graph (DAG). - -% \emph{Simulation 2} extends \emph{Simulation 1} by making the automated classifier classification errors $\xi$ that are correlated with $Y$ even after accounting for $Z$ and $x$. - In our real-data example, we included an example where the Perspective API disproportionately misclassified comments as toxic if they disclosed aspects of identities which resulted in differential misclassification. 
In \emph{Simulation 1b} (Figure \ref{fig:simulation.1b}), we test how error correction methods can handle such differential error by making AC predictions similarly depend on the dependent variable $Y$. This simulated AC has $\Sexpr{format.percent(sim1b.acc)}$ accuracy and makes predictions $W$ that are negatively correlated with the residuals of the linear regression of $X$ and $Z$ on $Y$ (Pearson's $\rho=\Sexpr{round(sim1b.cor.resid.w_pred,2)}$). As a result, this AC makes fewer false-positives and more false-negatives at greater levels of $Y$. -%Although the false-negative rate of the AC is \Sexpr{format.percent(sim1b.fnr)} overall, when $Y<=0$ the false-negative rate is only \Sexpr{format.percent(sim1b.fnr.y0)}, but when $Y>=0$ it rises to \Sexpr{format.percent(sim1b.fnr.y1)}. -%Figure \ref{fig:simulation.1b} shows a Bayesian network representing conditional dependencies of $Z$, $Y$, $Z$ and $W$ in \emph{Simulation 1b}. -%This is prototypical of an AC that influences behavior in a system under study. - - -% False negatives may cause delays in moderation increasing $Y$ (time-until-ban), while false-positives could draw moderator scrutiny and cause them to issue speedy bans. -% This mechanism is not mediated by observable variables such as reputation ($Z$) or the true use of toxicity ($X$). Therefore, we expect differential error. - \subsubsection{Measurement Error in a Dependent Variable (\textit{Simulation 2a} and \textit{2b})} We then simulate using an AC to measure the dependent variable $Y$ which we aim to explain given a binary independent variable $X$ and a continuous independent variable $Z$. The goal is to estimate $B_1$ and $B_2$ in the following logistic regression model: @@ -492,27 +353,21 @@ We then simulate using an AC to measure the dependent variable $Y$ which we aim \label{mod:measerr.logit} \end{equation} -%As was true for $X$ in \emph{Simulation 1}, human coders can observe $Y$ but doing so may be costly. 
We may thus instead use an AC that makes predictions $W = Y + \xi$ . - -%\noindent In our second real-data example, $Y$ is if a comment contains toxicity, $X$ is if the comment discloses racial or ethnic identity, and $Z$ is the number of times the comment was ``liked''. - In \emph{Simulation 2a} (see Figure \ref{fig:simulation.2a}) and \emph{Simulation 2b} (see Figure \ref{fig:simulation.2b}) $X$ and $Z$ are, again, balanced ($P(X)=P(Z)=0.5$) and correlated (Pearson's $\rho=\Sexpr{round(sim2a.cor.xz,2)}$). -%As in \emph{Simulation 1} we simulate scenarios where an AC is of practical use to estimate subtle relationships. In \emph{Simulation 1}, we chose the variance of the normally distributed outcome given our chosen coefficients $B_X$ and $B_Z$, but this is not appropriate for \emph{Simulation 2}'s logistic regression. We therefore choose, somewhat arbitrarily, $B_X=\Sexpr{sim2.Bx}$ and $B_Z=\Sexpr{sim2.Bz}$. We again simulate ACs with moderate predictive performance. The AC in \emph{Simulation 2a} is \Sexpr{format.percent(sim2a.AC.acc)} accurate and the AC in \emph{Simulation 2b} is \Sexpr{format.percent(sim2b.AC.acc)} accurate. In \emph{Simulation 2a}, the misclassifications are nonsystematic as $\xi$ has mean $0$ and is independent of $X$ and $Z$. However, in \emph{Simulation 2b} the misclassifications $\xi$ are systematic and correlated with $Z$ (Pearson's $\rho = \Sexpr{round(sim2b.error.cor.z,2)}$). -% Such differential error may arise if social media users are adept at skirting the rules without violating them. Such members are both likely to be warned by moderators and to leave comments misclassified as toxic. \section{Simulation Results} For each method, we visualize the consistency, efficiency, and the accuracy of uncertainty quantification of estimates across prototypical scenarios. 
-%Our main results are presented as plots visualizing the consistency (i.e., does the method, on average, recover the true parameter?), efficiency (i.e., how precise are estimates and does precision improve as sample size increases?), and the accuracy of uncertainty quantification of each method in each scenario. + For example, Figure \ref{fig:sim1a.x} visualizes results for \emph{Simulation 1a}. Each subplot shows a simulation with a given total sample size (No. observations) and a given sample of manually annotated observations (No. manually annotated observations). To assess a method's consistency, we locate the expected value of the point estimate across simulations with the center of the black circle. As an example, see the leftmost column in the bottom-left subplot of Figure \ref{fig:sim1a.x}. For the naïve estimator, the circle is far below the dashed line indicating the true value of $B_X$. Here, ignoring misclassification causes bias toward 0 and the estimator is inconsistent. To assess a method's efficiency, we mark the region in which the point estimate falls in 95\% of the simulations with black lines. The black lines in the bottom-left subplot of Figure \ref{fig:sim1a.x} for example show that the feasible estimator, which uses only manually annotated data, is consistent but less precise than estimates from error correction methods. To assess each method's uncertainty quantification, compare the gray lines, which show the expected value of a method's approximate 95\% confidence intervals across simulations, to the corresponding black lines. The \emph{PL} column in the bottom-left subplot of Figure \ref{fig:sim1a.x} for instance shows that the method's 95\% confidence interval is biased towards 0 when the number of manually annotated observations is smaller. This is to be expected because the PL estimator does not account for uncertainty in misclassification probabilities estimated using the sample of manually annotated observations. 
- %Now that we have explained how to interpret our plots, we unpack them for each simulated scenario. + \subsection{\emph{Simulation 1a:} Nonsystematic Misclassification of an Independent Variable} @@ -530,7 +385,7 @@ grid.draw(p) @ \caption{Simulation 1a: Nonsystematic misclassification of an independent variable. Error correction methods, except for PL, obtain precise and accurate estimates given sufficient manually annotated data. \label{fig:sim1a.x}} \end{figure} -%It is important to correct misclassification error even when an AC is only used as a statistical control \citep[for example]{weld_adjusting_2022}, because when a covariate $Z$ is correlated with $X$, misclassifications of $X$ cause bias in the \emph{naïve} estimates of $B_Z$, the regression coefficient of $Z$ on $Y$. As Figure \ref{fig:sim1a.z} in Appendix \ref{appendix:main.sim.plots} shows, methods that effectively correct estimates of $X$ in \emph{Simulation 1a} also correct estimates of $B_Z$. + In brief, when misclassifications cause nondifferential error, MLA and GMM are effective, efficient, and provide accurate uncertainty quantification. They complement each other due to different assumptions: MLA depends on correctly specifying the likelihood but its robustness to incorrect specifications is difficult to analyze \citep{carroll_measurement_2006}. The GMM approach depends on the exclusion restriction instead of distributional assumptions \citep{fong_machine_2021}. MLA's advantage over GMM come from the relative ease with which it can be extended to for instance generalized linear models (GLMs) or generalized additive models (GAMs). In cases similar to \emph{Simulation 1a}, we therefore recommend both GMM and MLA to correct for misclassification. @@ -549,7 +404,6 @@ grid.draw(p) Figure \ref{fig:sim1b.x} illustrates \emph{Simulation 1b}. Here, systematic misclassification gives rise to differential error and creates more extreme misclassification bias that is more difficult to correct. 
As Figure \ref{fig:sim1b.x} shows, the naïve estimator is opposite in sign to the true parameter. Of the four methods we test, only the MLA and the MI approach provide consistent estimates. This is expected because they use $Y$ to adjust for misclassifications. The bottom row of Figure \ref{fig:sim1b.x} shows how the precision of the MI and MLA estimates increase with additional observations. As in \emph{Simulation 1a}, MLA uses this data more efficiently than MI does. However, due to the low accuracy and bias of the AC, additional unlabeled data improves precision less than one might expect. Both methods provide acceptably accurate confidence intervals. Figure \ref{fig:sim1b.z} in Appendix \ref{appendix:main.sim.plots} shows that, as in \emph{Simulation 1a}, effective correction for misclassifications of $X$ is required to consistently estimate $B_Z$, the coefficient of $Z$ on $Y$. Inspecting results from methods that do not correct for differential error is useful for understanding their limitations. When few annotations of $X$ are observed, GMM is nearly as bad as the naïve estimator. PL is also visibly biased. Both improve when a greater proportion of the data is labeled since they combine AC-based estimates with the feasible estimator. - In sum, our simulations suggest that the MLA approach is superior in conditions of differential error. Although estimations by the MI approach are consistent, the method's practicality is limited by its inefficiency. \subsection{\emph{Simulation 2a:} Nonsystematic Misclassification of a Dependent Variable} @@ -566,13 +420,9 @@ grid.draw(p) Figure \ref{fig:sim2a.x} illustrates \emph{Simulation 2a}: nonsystematic misclassification of a dependent variable. This also introduces bias as evidenced by the naïve estimator's inaccuracy. Our MLA method is able to correct this error and provide consistent estimates. Surprisingly, the MI estimator is inconsistent and does not improve with more human-labeled data. 
-%Note that the GMM estimator is not designed to correct misclassifications in the outcome. The PL approach is also inconsistent, especially when only few of all observations are annotated manually. It is closer to recovering the true parameter than the MI or the naïve estimator, but provides only modest improvements in precision compared to the feasible estimator. It is clear that the precision of the MLA estimator improves with more observations data to a greater extent than the PL estimator. -When the amount of human-labled data is low, inaccuracies in the 95\% confidence intervals of both the MLA and PL become visible due to the poor finite-sample properties of the quadradic approximation for standard errors. -%As before, PL's inaccurate confidence intervals are due to its use of finite-sample estimates of automated classification probabilities. -%In both cases, the poor finite-sample properties of the fischer-information quadratic approximation contribute to this inaccuracy. In Appendix \ref{appendix:sim1.profile}, we show that the MLA method's inaccuracy vanishes when using the profile-likelihood method instead. - +When the amount of human-labeled data is low, inaccuracies in the 95\% confidence intervals of both the MLA and PL become visible due to the poor finite-sample properties of the quadratic approximation for standard errors. In brief, our simulations suggest that MLA is the best error correction method when random misclassifications affect the dependent variable. It is the only consistent option and more efficient than the PL method, which is almost consistent. \subsection{\emph{Simulation 2b}: Systematic Misclassification of a Dependent Variable} @@ -596,9 +446,6 @@ Therefore, our simulations suggest that MLA is the best method when misclassific \section{Transparency about Misclassification Is Not Enough—We Have To Fix It! Recommendations for Automated Content Analysis} ``Validate, Validate, Validate'' \citep[p. 269]{grimmer_text_2013} is one of the guiding mantras for automated content analysis. 
It reminds us that ACs can produce misleading results and of the importance of steps to ascertain validity, for instance by making misclassification transparent. -%\citet[p.5]{grimmer_text_2013} write that -%``when categories are known [...], scholars must demonstrate that the supervised methods are able to reliably replicate human coding.'' -%This suggests that quantifying an AC's predictive performance by comparing human-labeled validation data to automated classifications sufficiently establishes an AC's validity and thereby the validity of downstream analyses. Like \citet{grimmer_text_2013}, we are deeply concerned that computational methods may produce invalid evidence. In this sense, their validation mantra animates this paper. But transparency about misclassification rates via metrics such as precision or recall leaves unanswered an important question: Is comparing automated classifications to some external ground truth sufficient to claim that results are valid? Or is there something else we can do and should do? We think there is: Using statistical methods to not only quantify but also correct for misclassification. Our study provides several recommendations in this regard, as summarized in Figure \ref{fig:FigureRecommendations}. @@ -610,12 +457,6 @@ We think there is: Using statistical methods to not only quantify but also corre \label{fig:FigureRecommendations} \end{figure} -% \includegraphics{Recommendations.PNG} - - - -%Similar to recent work in communication science \citep{mahl_noise_2022, stoll_supervised_2020}, our goal is not only to \textit{highlight} and \textit{quantify} common pitfalls in automated content analysis applications of ACs but to also \textit{propose} constructive guidelines on the road ahead. - \subsubsection{Step 1: Attempt Manual Content Analysis} Manual content annotation is often done \textit{post facto}, for instance to calculate predictiveness of an already existing AC such as Google's Perspective classifier. 
We propose to instead use manually annotated data \textit{ante facto}, i.e. before building or validating an AC. @@ -625,27 +466,15 @@ Often, ACs are seen as a cost-saving procedure but scholars often fail to consid Moreover, validating an existing AC or building a new AC is also expensive, for instance due to costs of computational resources or manual annotation of (perhaps smaller) test and training datasets. We therefore caution researchers against preferring automated over manual content analysis unless doing so is necessary to obtain useful evidence. We agree with \citet{baden_three_2022} who argue that ``social science researchers may be well-advised to eschew the promises of computational tools and invest instead into carefully researcher-controlled, limited-scale manual studies'' (p. 11). In particular, we recommend using manually annotated data \textit{ante facto}: Researchers should begin by examining human-annotated data so to discern if an AC is necessary. In our simulations, the feasible estimator is less precise but consistent in all cases. So if fortune shines and this estimate sufficiently answers one's research question, manual coding is sufficient. Here, scholars should rely on existing recommendations for descriptive and inferential statistics when using manual content analysis \citep{geis_statistical_2021, bachl_correcting_2017}. If the feasible estimator however fails to provide convincing evidence, for example by not rejecting the null, manually annotated data is not wasted. It can be reused to build an AC or correct misclassification bias. -%One potential problem of this \textit{ante facto} approach is that conducting two statistical tests of the same hypothesis increases the chances of false discover. A simple solution to this is to adjust the significance threshold $\alpha$ for drawing conclusions from the feasible estimate. %We recommend p < .01. 
%That said, it might useful use an AC in a preliminary analysis, prior to collecting validation data when an AC such as one available from an API, is available for reuse and confusion matrix quantities necessary for the pseudo-likelihood (PL) method are published. Although (PL) is inconsistent when used for a covariate, this can be corrected if the true rate of $X$ can be estimated. -%Caution is still warranted because ACs can perform quite differently from one dataset to another so we recommend collecting validation representative of your study's dataset and using another appropriate method for published studies. \subsubsection{Step 2: Use Manually Annotated Data to Detect Systematic Misclassification} -% Let's suppose an AC is used to the feasible estimator is insufficiently informative -%There are many guides on how to train and validate ACs \citep[e.g.][]{grimmer_text_2013,van_atteveldt_validity_2021}. However, they mostly refer to performance metrics such as the F1-score or Area under the Curve (AUC). The problem with this approach is that such criteria make misclassifications transparent but do not provide information on how misclassification will affect downstream analyses and how to correct for such effects. -%One reason for this is that such criterion do not account for differential error or for correlation between misclassifications in the outcome and a regression covariate—both of which can give rise to extremely misleading statistics. As demonstrated in our simulations, knowing whether an AC makes systematic misclassifications is important: It determines which correction methods can work. Fortunately, manually annotated data can be used to detect systematic misclassification. For example, \citet{fong_machine_2021} suggest using Sargan's J-test of the null hypothesis that the product of the AC's predictions and regression residuals have an expected value of 0. 
More generally, one can test if the data's conditional independence structures can be represented by Figures \ref{fig:simulation.1a} or \ref{fig:simulation.2a}. This can be done, for example, via likelihood ratio tests of $P(W|X,Z) = P(W|X,Y,Z)$ (if an AC measures an independent variable $X$) or of $P(W|Y) = P(W|Y,Z,X)$ (if an AC measures a dependent variable $Y$) or by visual inspection of plots of relating misclassifications to other variables \citep{carroll_measurement_2006}. We strongly recommend using such methods to test for systematic misclassification and to design an appropriate correction. -% For example, ``algorithmic audits'' \citep[e.g.,][]{rauchfleisch_false_2020, kleinberg_algorithmic_2018} evaluate the performance of AC across different subgroups in the data. - -% This may be important when using different ACs for corpora of different languages or data from different social media platforms. If the accuracy of an AC varies with language or platforms, we may expect differential error. - -% In turn, differential misclassification can be ruled out if the performance of an AC is the same across all analytically relevant subgroups and other variables. - - \subsubsection{Step 3: Correct for Misclassification Bias Instead of Being Naïve} Across our simulations, we showed that the naïve estimator is biased. Testing different error correction methods, we found that these generate different levels of consistency, efficiency, and accuracy in uncertainty quantification. That said, our proposed MLA method should be considered as a versatile method because it is the only method capable of producing consistent estimates in prototypical situations studied here. We recommend the MLA method as the first ``go-to'' method. As shown in Appendix \ref{appendix:robustness}, this method requires specifying a valid error model to obtain consistent estimates. One should take care that the model not have omitted variables including nonlinearities and interactions. 
@@ -663,10 +492,6 @@ recommending that researchers report methodological decisions so other can under In our review, we found that reporting such decisions is not yet common, at least in the context of SML-based text classification. When correcting for misclassification, uncorrected results will often provide a lower-bound on effect sizes; corrected analyses will provide more accurate but less conservative results. Therefore, both corrected and uncorrected estimates should be presented as part of making potential multiverses of findings transparent. -% we -% To report instead of hiding methodological decisions and related uncertainty that may emerge in generated results, -%We realize that researchers might need to cut methodological information, especially for empirical studies, to conform to either word limits or reviewers. If word limitations are the problem, this information could be reported in appendices. -% Here, the field might consider adopting ---or adapting--- machine learning reporting standards such as DOME (Computational Biology) and PRIME (Diagnostic medicine). \section{Conclusion and Limitations} @@ -686,7 +511,7 @@ Based on these results, we provide four recommendations for the future of automa Our study has several limitations. First, the simulations and methods we introduce focus on misclassification by automated tools. They provisionally assume that human annotators do not make errors, especially not systematic ones. This assumption can be reasonable if intercoder reliability is very high but, as with ACs, this may not always be the case. -%Alternatively, validation data can be treated as a gold standard if the goal is measuring \emph{how a person categorizes content}, as opposed to the more common approach of measuring presumably objective content categories. That said, the prevailing approaches in content analysis use human coders to measure a latent category who are prone to misclassification. 
+ Thus, it may be important to account for measurement error by human coders \citep{bachl_correcting_2017} and by automated classifiers simultaneously. In theory, it is possible to extend our MLA approach in order to do so \citep{carroll_measurement_2006}. However, because the true values of content categories are never observed, accounting for automated and human misclassification at once requires latent variable methods that bear considerable additional complexity and assumptions \citep{pepe_insights_2007}. We leave the integration of such methods into our MLA framework for future work. In addition, our method requires an additional assumption that the error model is correct. As we argue in Appendix \ref{appendix:robustness} (section \ref{appendix:assumption}), this assumption is often acceptable. Second, the simulations we present do not consider all possible factors that may influence the performance and robustness of error correction methods including classifier accuracy, heteroskedasticity, and violations of distributional assumptions. We are working to investigate such factors, as shown in Appendix \ref{appendix:robustness}, by extending our simulations. @@ -824,7 +649,6 @@ Equation \ref{eq:mle.covariate.chainrule.3} shows a different way to factor the Equation \ref{eq:mle.covariate.chainrule.4} factors $P(Y,X=x)$ the joint probability of $Y$ and $X$ into $P(Y|X=x)$, the conditional probability of $Y$ given $X$, $P(W|X=x,Y)$, the conditional probability of $W$ given $X$ and $Y$, and $P(X=x)$ the probability of $X$. This shows that fitting a model $Y$ given $X$ in this framework, such as the regression model $Y = B_0 + B_1 X + B_2 Z$ requires including the exposure model for $P(X=x)$. Without validation data, $P(X=x)$ is difficult to calculate without strong assumptions \citep{carroll_measurement_2006}, but $P(X=x)$ can easily be estimated using a sample of validation data. 
-%Our appendix includes supplementary simulations that explore how robust our method to model mispecification. Equations \ref{eq:mle.covariate.chainrule.1}--\ref{eq:mle.covariate.chainrule.4} demonstrate the generality of this method because the conditional probabilities may be calculated using a wide range of probability models. For simplicity, we have focused on linear regression for the probability of $Y$ and logistic regression for the probability of $W$ and the probability of $X$. However, more flexible probability models such as generalized additive models (GAMs) or Gaussian process classification may be useful for modeling nonlinear conditional probability functions \citep{williams_bayesian_1998}. @@ -939,7 +763,6 @@ Here, we demonstrate how misspecification of the error correction model affects \subsubsection{Systematic Misclassification of an Independent Variable} \label{appendix:misspec.iv} Repeating \emph{Simulation 1b}, what happens when the error model is misspecified? Figure \ref{fig:iv.noz} visualizes effects on $B_X$ (upper panel) and $B_Z$ (lower panel). It shows that a misspecified MLA model is unable to fully correct misclassification bias: Although estimates of $B_X$ are close to the true estimate and estimates of $B_Z$ are better than the näive estimator, $B_Z$ is still clearly biased. -%Here we refer to $P(Y|X,Z,\Theta_Y)$ as the ``outcome model'', $P(W|Y,X,Z,\Theta_W)$ as the ``proxy model'', and $P(X|Z,\Theta_X)$ as the ``truth model''. \begin{figure}[htpb!] @@ -1168,195 +991,4 @@ grid.draw(p) \label{fig:dv.degreebias} \end{figure} -%However, if one can assume the model for $Y$, then one believes that $Y$ and $X$ are conditionally independent given other observed variables. 
- -% \section{Addit equation ional simulations} -% \subsection{Heteroskedasktic but nondifferential misclassifications}\label{appendix:sim1.hetero} - -% \subsection{Imbalanced covariates} -% \label{appendix:sim1.imbalanced} \end{document} - -\subsection{Profile likelihood improves uncertainty quantification} -\label{appendix:sim1.profile} - -\section{Four prototypical scenarios} - -We must clearly distinguish four types of measurement error that arise in this context. -The first type occurs when a covariate is measured with error and this error can be made statistically independent of the outcome by conditioning on other covariates. In this case the error is called nondifferential. -The second type, differential error occurs when a covariate is measured with error that is systematically correlated with the outcome, even after accounting for the other covariates \citep{carroll_measurement_2006}. -These two types of error apply when an AC is used to measure a covariate. -When an AC is used to measure an outcome, errors can be random—uncorrelated with the covariates or they can be systematic—correlated with a covariate. - -nondifferential measurement error and random error in the outcome are relatively straightforward to correct. We will argue below that differential measurement error can be avoided when an AC is carefully designed. Yet the risk of differential measurement error is considerable in such cases as multilingual text classification because the ease of classification may systematically vary in relation to the outcome and covariates or as when a model trained in one context is applied in another. - -Research using ACs based on supervised machine learning may be particularly prone to differential and systematic measurement error. 
Problems of bias and generalizability have machine learning field of machine learning more generally has - -%Statistical theory and simulations have shown that all these methods are effective (though some are more efficient) when ``ground-truth'' observations are unproblematic and when classifiers only make random, but not systematic, errors. We contribute by testing these methods in more difficult cases likely to arise in text-as-data studies. - -% -% All prior methods for correcting measurement error using validation data presume that the validation data is error-free. However, the methodological content analysis literature has extensively studied the difficulties in human-labeling theoretically and substantively significant content categories through the lens of inter-coder reliability. We contribute novel methods that account for both inter-coder reliability and machine classification error. - -Our monte-carlo simulations show that different error-correction methods fail in different cases and that none is always the best. For example, methods that can correct for differential error will be inefficient when none is present. In addition, Fong and Taylor \citep{fong_machine_2021}'s method-of-moments estimator exchanges distributional assumptions for an exclusion restriction and fails in different cases from methods based on parametric models, such as ours. - - -\subsection{Our Contributions} - -\begin{itemize} - \item Introduce this methodological problem to Communication Research; argue that this is not too far from ignoring disagreement in manual codings - \item Document the prevalence of automated content analysis to show the importance of the problem. - \item Summarize available statistical methods for adjusting for measurement error and bias. - \item Evaluate these methods in realistic scenarios to show when they work and when they do not. - \item Recommend best practices for applied automated content analysis. 
- \item Chart directions for future research to advance methods for automated content analysis. -\end{itemize} - -\section{Background} - -\subsection{Methods used to correct measurement error in simulation scenarios} - -We'll compare the performance of these methods in terms of: - -\begin{itemize} - \item Consistency: Does the method recover the true parameter on average? - \item Efficiency: How precise are the estimates? Does precision improve with sample size? - \item Robustness: Does the method work when parametric assumptions are violated? -\end{itemize} - -We'll run simulations that vary along these dimensions: - -\begin{itemize} - \item Explained variance (function of $B_XZ$ and $\varepsilon$) - \item Predictor accuracy (we'll always have balanced classes). - \item iterrater reliability - \item Data type of measured variable: binary / likert - \item Distribution of other variable: normal, lognormal, binary - \item Unlabeled sample size - \item Labeled sample size -\end{itemize} - - -\subsection{Explanation of Bayesian Networks / Causal Dags for representing scenarios} - -In this section we present the design of our simulation studies. So far I have designed the following three scenarios (though I have some work to do to polish them and fix bugs): - -\subsection{Definition of MLA Models} - -We model example 1 and 2, -\section{Discussion} - -\citet{fong_machine_2021} argue, and we agree, that a carefully designed AC can avoid forms of measurement error that are more difficult to deal with. However, tailoring an AC from scratch requires considerable effort and expense compared to reuse an AC developed for common purposes as the wide popularity that classifiers like LIWC and Perspective enjoy demonstrates. Our recommended approaches of GMM calibration, multiple imputation and likelihood modeling can all be concieved as fine-tuning steps that transform general purpose classifiers into tailored classifiers capable of providing reliable inferences. 
- -A natural response to the above extended meditation on measurement error in the context of automatic classifiers is to question the purpose of using ACs at all. It seems strange to think that by using model's predictions of a variable to build another model predicting that same variable we can solve the problems introduced by first model. Indeed, the more complex modeling strategies we propose are only necessary to correct the shortcomings of an AC. We envision ACs such as a commercial APIs, widely used dictionaries, or ACs that are generalized to new contexts that are likely to have such shortcomings because such ACs may provide information about a variable that would be difficult to obtain otherwise. - -Even though machine learning algorithms such as random forests might obtain greater performance at automatic classification, this comes at the expense of bias that may be difficult to model using validation data \citep{breiman_statistical_2001}. -Instead of tayloring an AC for a research study, using predictive features directly to infer missing validation data using multiple imputation or to model the probability of a variable in the likelihood modeling framework may be simpler and more likely to result in valid inferences. - -% A common strategy is to use a machine learning classifier $g(\mathbf{K})$ (e.g., the Perspective API) to obtain Often, researchers use the $N^*$ observations of $\mathbf{x}$ to build $\hat{\mathbf{w}}=g(\mathbf{Z})$. Other times they may use a different ``black-box'' model $g(\mathbf{Z})$ that is perhaps trained on a larger dataset different from that used to estimate $B$. - - -% Although it is often claimed that this bias is a conservative ``attenuation'' of estimates toward zero, this is only necessarily the case of ordinary linear regression with 2 variables when the bias is uncorrelated with $\mathbf{x}$ and $\mathbf{y}$ \citep{carroll_measurement_2006}. 
What's more, in conditions likely to occur in social scientific research, such as when the explained variance of the regression model is very low, the estimate of $\hat{B}^*$ can be \emph{more precise} than that of $\hat{B}$. As a result, the measurement error of a machine learning classifier is not always conservative but can result in false discovery \citep{carroll_measurement_2006}. - - - Note that specific forms of statistical bias are of particular concern for scientific measurement and although these may often be related to biases against social groups \cite[][e.g.]{obermeyer_dissecting_2019}, these notions of bias are not equivalent \cite{kleinberg_algorithmic_2018}. Introduce multi-lingual text classification as an example. - -(attenuation bias / correlation dilution), but this bias towards zero defeats the purpose of automated content analysis in the first place! -\subsection{Rationale} -\begin{itemize} - \item Automated content analysis is all the rage. Tons of people are doing it, but they all have the same problem: their models are inaccurate. They don't know if the model is accurate enough to trust their inferences. - - \item Social scientists often adopt performance criteria and standards for machine learning predictors used in computer science. These criteria do not tell how well a predictor works as a measurement device for a given scientific study. - - \item In general, prediction errors result in biased estimates of regression coefficients. In simple models with optimistic assumptions this bias will be conservative (attenuation bias / correlation dilution), but this bias towards zero defeats the purpose of automated content analysis in the first place! - - \item In more general scenarios (e.g., GLMs, differential error, multivariate regression), prediction errors can create bias that is not conservative. 
- - \item Statisticians have studied measurement error for a long time, and have developed several methods, but the settings they consider most often lack features of automated content analysis. Specifically: - - \begin{itemize} - \item The availability of (potentially inaccurate) validation data. (Most methods are designed for \emph{sensors} where the distribution of the error can be known, but error can be assumed to be nondifferential). - \item Differential error—the amount of noise is not independent of observations. - - \item The possibility of bias in addition to noise. - \end{itemize} - - \item Conducting simulations to evaluate existing methods including regression calibration, the extension of regression calibration by Fong and Taylor (2021) \cite{fong_machine_2021}, multiple imputation, and simulation extrapolation. - - \item These issues become even more important, and also more complex in important research designs such as those involving multiple languages. - - -% \subsection{Imperfect human-coded validation data} - -% All approaches stated above depend on the human-coded validation data $X^*$. Most often, ACs are also trained on human-coded material. The content analysis literature has long been documented how unreliable human coding and manual content analysis papers routinely report intercoder reliability as a result \citep{krippendorff_content_2018}. Intercoder reliability metrics typically assume that human coders are interchangeable and the only source of disagreement is ``coder idiosyncrasies'' \citep{krippendorff_reliability_2004}. A previous monte-carlo simulation operationalizes these ``coder idiosyncrasies'' as a fixed probability that a coder makes a random guess independent of the coder and of the material \citep{geis_statistical_2021}. In this work, we accept this ``interchangeable coders making random errors'' (ICMRE) assumption. 
Under this optimistic assumption, only ``coder idiosyncrasies'' cause misclassification error in the validation data. - -% \citet{song_validations_2020}'s monte-carlo simulation demonstrates that human-coded $X^*$ with a lower intercoder reliability generates more biased classification accuracy of the AC. So even if manual annotation errors are only due to the ICMRE assumption, they may bias results. None of the above correction approaches account for the imperfect human coding of $X^*$, although \citet{zhang_how_2021} identifies the omission of this as a weakness of his proposed approach. Even in the context of manual content analysis, these ``coder idiosyncrasies'' are not routinely adjusted (although methods are available, e.g. \citet{bachl_correcting_2017}). -% An advantage of our proposed method over prior approaches is that it automatically accounts for imperfection of human coding under the ICMRE assumption because the random errors in validation data are independent from the AC errors. - -% Precision of estimates can be improved using more than one independent coder. With two coders, for example, two sets of validation data are generated, $X^*_{1}$, $X^*_{2}$. We then list-wise delete all data that $X^*_{1} \neq X^*_{2}$. If the ICMRE assumption holds, the deleted data, where two coders disagree, can only be due to ``coder idiosyncrasies''. As coders are assumed to be interchangeable, the probability of two interchangeable coders both making the same misclassification error is much less than the probability that one makes a misclassification error . Using such ``labeled-only, coherent-only'' (LOCO) data improves the precision of consistent estimates in our simulation. - - -% \subsection{Measurement error in validation data} - -% The simulations above assume that validation data is perfectly accurate. This is obviously unrealistic because, validation data, such as that obtained from human classifiers, normally has inaccuracies. 
-% To evaluate the robustness of correction methods to imperfect validation data, we extend our scenarios with with nondifferential error with simulated validation data that is misclassified \Sexpr{format.percent(med.loco.accuracy)} of the time at random. - -\subsubsection{Recommendation II: Employ at Least Two Manual Coders, not One} - -Independent of whether researchers use manually annotated data for the feasible approach or AC, principles of manual content analysis, including justifying one's sample size, still apply. -%\citep[for details]{krippendorff_content_2018}. -%TODO uncomment below after ICA -Arguably, the most important problem in traditional content -analysis is whether human coders are capable of reliably classifying content into the categories under study. With multiple human coders labelling the same data, metrics such as Krippendorff's $\alpha$ -%and Gwet's $AC$ -can quantify ``intercoder reliability'' in terms of how often coders agree and disagree \citep{krippendorff_reliability_2004}. -These metrics all assume that disagreements are due to -``coder idiosyncrasies'' that are independent of the data \citep{krippendorff_reliability_2004}. - -We recommend that such metrics also be used to establish intercoder reliability in all of the human-labeled data, not only a smaller subset for intercoder testing. -Other than that, the gold standard data is also reused in later steps and those steps can be influenced by these ``coder idiosyncrasies'' \citep{song_validations_2020}. -We recommend that the gold standard data should be manually coded by two coders, not one. It allows the calculation of interrater reliability, a more accurate validation of the AC's performance, and better correction. With additional independent coders, would eliminate even more of these ``coder idiosyncrasies'' than two coders. - - - -However, the gains from introducing additional coders are diminishing so using more than two coders may not be cost effective. 
-\end{itemize} - - -\section{Accounting for errors in the validation data} - -In this section, we extend \emph{Simulation 1b} and \emph{Simulation 2b} with - - -\begin{figure} -<>= -#plot.df <- -p <- plot.simulation.irr(plot.df.example.5,'z') -grid.draw(p) -@ -\caption{Estimates of $B_Z$ in multivariate regression with $X$ measured using machine learning, with validation data collected by 2 independent coders that make random errors.} -\end{figure} -\begin{figure} -<>= -#plot.df <- -p <- plot.simulation.irr(plot.df.example.5,'x') -grid.draw(p) -@ -\caption{Estimates of $B_X$ in multivariate regression with $X$ measured using machine learning, with validation data collected by 2 independent coders that make random errors.} -\end{figure} - -\begin{figure} -<>= -#plot.df <- -p <- plot.simulation.irr.dv(plot.df.example.6,'z') -grid.draw(p) -@ -\caption{Estimates of $B_Z$ in multivariate regression with $Y$ measured using machine learning, with validation data collected by 2 independent coders that make random errors.} -\end{figure} -\begin{figure} -<>= -#plot.df <- -p <- plot.simulation.irr.dv(plot.df.example.6,'x') -grid.draw(p) -@ -\caption{Estimates of $B_X$ in multivariate regression with $Y$ measured using machine learning, with validation data collected by 2 independent coders that make random errors.} -\end{figure} diff --git a/bayesnets.tex b/bayesnets.tex index ef5bb97..0437d13 100644 --- a/bayesnets.tex +++ b/bayesnets.tex @@ -16,19 +16,12 @@ \node[observed] (y) {$Y$}; \node[unobserved, above=of y] (x) {$X$}; \node[observed, left=of x] (w) {$W$}; - -% \node[unobserved, above=of w] (k) {$K$}; \node[observed,right=of x] (z) {$Z$}; -% \node[residual,below=of y] (e) {$\varepsilon$}; -% \node[residual,below=of w] (xi) {$\xi$}; \draw[-] (z) to (y); \draw[-] (z) -- (x); \draw[-] (x) -- (y); \draw[-] (x) -- (w); -% \draw[-] (y) -- (w); -% \draw[-] (x) -- (xi); - % \draw[-] (w) -- (xi); \end{tikzpicture} \caption{In \emph{Simulation 1a}, classifications $W$ are 
conditionally independent of $Y$ so a model using $W$ as a proxy for $X$ has non-differential error. \label{fig:simulation.1a}} @@ -41,25 +34,18 @@ \node[observed] (y) {$Y$}; \node[unobserved, above=of y] (x) {$X$}; \node[observed, left=of x] (w) {$W$}; - -% \node[unobserved, above=of w] (k) {$K$}; \node[observed,right=of x] (z) {$Z$}; -% \node[residual,below=of y] (e) {$\varepsilon$}; -% \node[residual,below=of w] (xi) {$\xi$}; \draw[-] (z) to (y); \draw[-] (z) -- (x); \draw[-] (x) -- (y); \draw[-] (x) -- (w); -% \draw[-] (k) -- (w); \draw[-] (x) to (y); \draw[-] (w) -- (y); -% \draw[-] (x) -- (xi); -% \draw[-] (z) -- (xi); -% \draw[-] (w) -- (xi); + \end{tikzpicture} \caption{In \emph{Simulation 1b}, the edge from $W$ to $Y$ signifies that the automatic classifications $W$ are not conditionally independent of $Y$ given $X$, indicating differential error. \label{fig:simulation.1b} @@ -75,16 +61,12 @@ \node[observed, above=of y] (x) {$X$}; \node[observed, right=of y] (w) {$W$}; -% \node[unobserved, above=of w] (k) {$K$}; + \node[observed,right=of x] (z) {$Z$}; -% \node[residual,below=of y] (e) {$\varepsilon$}; - % \node[residual,below=of w] (xi) {$\xi$}; \draw[-] (z) to (y); \draw[-] (x) -- (y); \draw[-] (y) -- (w); \draw[-] (x) -- (z); -% \draw[-] (k) -- (w); - % \draw[-] (w) -- (xi); \end{tikzpicture} \caption{In \emph{Simulation 2a}, an unbiased classifier measures the outcome. 
\label{fig:simulation.2a}} @@ -97,19 +79,13 @@ \node[observed={white}{gray!40}, above=of y] (x) {$X$}; \node[observed, right=of y] (w) {$W$}; -% \node[unobserved, above=of w] (k) {$K$}; + \node[observed,right=of x] (z) {$Z$}; -% \node[residual,below=of y] (e) {$\varepsilon$}; -% \node[residual,below=of w] (xi) {$\xi$}; \draw[-] (x) -- (y); \draw[-] (z) -- (w); \draw[-] (y) -- (w); \draw[-] (x) -- (z); -% \draw[-] (k) -- (w); \draw[-] (z) -- (y); -% \draw[-] (z) -- (k); -% \draw[-] (y) -- (xi); -% \draw[-] (w) -- (xi); \end{tikzpicture} \caption{In \emph{Simulation 2b}, the edge connecting $W$ and $Z$ signifies that the predictions $W$ are not conditionally independent of $Z$ given $Y$, indicating systematic misclassification. \label{fig:simulation.2b}} \end{subfigure} @@ -120,7 +96,6 @@ \matrix [draw, below, font=\small, align=center, column sep=2\pgflinewidth, inner sep=0.4em, outer sep=0em, nodes={align=center, anchor=center}] at (current bounding box.south){ \node[observed,label=right:observed] {}; \\ \node[unobserved,label=right:automatically classified]{}; \\ -% \node[residual,label=right:error term]{}; \\ }; \end{tikzpicture} \end{subfigure} diff --git a/flowchart_recommendations.tex b/flowchart_recommendations.tex index 27d92a9..3513a68 100644 --- a/flowchart_recommendations.tex +++ b/flowchart_recommendations.tex @@ -61,11 +61,8 @@ \node[outcome box] (outcome_systematic_iv) [below =3.8in of report_manual] {Use MLA or MI.}; \node[outcome box] (outcome_nonsystematic_iv) [above =3ex of outcome_systematic_iv] {Use GMM or MLA.}; - % \node[outcome box] (outcome_systematic_dv) [below =2in of outcome_nonsystematic_iv] {Use MLA.}; - \node[outcome box] (outcome_dv) [below =3ex of outcome_systematic_iv] {Use MLA.}; - % & \node[] (iv_1) {Independent variable}; & \node[decision box] (dv_1) {Dependent variable}; \\ \draw[myarrow] (manual.south) to [controls=+(280:2) and +(165:1.5)] (report_manual.west) { node [mylabel, pos=0.5, yshift=-5ex, xshift=5.5in] {Found 
convincing evidence}}; @@ -82,14 +79,9 @@ \draw[myarrow] (dependent)+(8ex,2ex) to [xshift=15ex,yshift=-4ex, controls=+(80:0.8) and +(170:1)] (outcome_dv.west); - % \draw[myarrow] (systematic)to [xshift=15ex,yshift=-4ex, controls=+(270:1) and +(180:1)] (outcome_systematic_dv.west) {node [mylabel] { Dependent variable}}; \draw[myarrow] (correct) to (report); -% \draw[myarrow] (convincing_evidence.east) to (report_manual); -% \draw[myarrow] (convincing_evidence.south) to (test_systematic); -% \draw[myarrow] (test_systematic) to (iv_1); - \end{tikzpicture} \vspace{1ex} % \end{figure}