Merge branch 'master' of code.communitydata.science:cdsc_examples_repository

2022-09-22 16:58:43 -07:00 · 2022-09-22 16:58:43 -07:00 · 50edd8c40a
commit 50edd8c40a
parent a471417984 3041126fcd
63 changed files with 26846 additions and 0 deletions
--- a/Ecology_REU_Flyer.docx
+++ b/Ecology_REU_Flyer.docx
--- a/dissertation_proposals/teblunthuis_ecology.pdf
+++ b/dissertation_proposals/teblunthuis_ecology.pdf
--- a/dissertations/nathante_uw_2021/ETD_version.tex
+++ b/dissertations/nathante_uw_2021/ETD_version.tex
@ -0,0 +1,321 @@
+ \documentclass[12pt]{memoir}
+
+\usepackage{cdsc-memoir}
+% there are two chapter styles: cdsc-article and cdsc-memo
+% memo assumes that you remove the "\\" and the email address from the
+% \author field below as well as that you will comment out the
+% \published tag
+\chapterstyle{cdsc-article}
+
+\usepackage[utf8]{inputenc}
+\usepackage{wrapfig}
+\usepackage[T1]{fontenc}
+\usepackage{textcomp}
+% \usepackage[garamond]{mathdesign}
+\let\circledS\undefined
+
+\usepackage[letterpaper,left=1in,right=1in,top=1in,bottom=1in]{geometry}
+
+% packages i use in essentially every document
+\usepackage{graphicx}
+\usepackage{enumerate}
+\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
+\newcommand{\maxwidth}{\linewidth}
+% packages i use in many documents but leave off by default
+\usepackage{amsmath, amsthm, amssymb}
+\usepackage{dcolumn}
+% \usepackage{endfloat}
+
+% import and customize urls
+\usepackage[usenames,dvipsnames]{color}
+\usepackage[breaklinks]{hyperref}
+
+\hypersetup{colorlinks=true, linkcolor=Black, citecolor=Black, filecolor=Blue,
+    urlcolor=Blue, unicode=true}
+
+\usepackage{xcolor}
+\definecolor{shadecolor}{rgb}{.97, .97, .97}
+\definecolor{messagecolor}{rgb}{0, 0, 0}
+\definecolor{warningcolor}{rgb}{1, 0, 1}
+\definecolor{errorcolor}{rgb}{1, 0, 0}
+\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
+
+\definecolor{mygreen}{HTML}{43bf71}
+
+% list of footnote symbols for \thanks{}
+% \@fnsymbol maps a counter value to a symbol: 1 -> *, 2 -> dagger,
+% 3 -> double dagger, 4 -> section mark, 5 -> paragraph mark, 6 -> \|,
+% 7 -> **, 8 -> two daggers, 9 -> two double daggers.  Any other value
+% falls through to \@ctrerr.  The whole symbol is wrapped in \ensuremath
+% so it can be used from text mode.
+\makeatletter
+\renewcommand*{\@fnsymbol}[1]{\ensuremath{\ifcase#1\or *\or \dagger\or \ddagger\or
+ \mathsection\or \mathparagraph\or \|\or **\or \dagger\dagger
+  \or \ddagger\ddagger \else\@ctrerr\fi}}
+\makeatother
+% \samethanks[<n>] emits \footnotemark[<n>]; with no optional argument
+% <n> defaults to the current value of the footnote counter, i.e. it
+% repeats the most recent footnote mark.
+\newcommand*\samethanks[1][\value{footnote}]{\footnotemark[#1]}
+
+% add bibliographic stuff 
+\usepackage[american]{babel}
+\usepackage{csquotes}
+\usepackage[natbib=true, style=apa, backend=biber]{biblatex}
+%\addbibresource{ecological_models.bib}
+%\addbibresource{ch1_intro.bib}
+\addbibresource{articlequality.bib}
+\addbibresource{equalogy_refs.bib}
+\addbibresource{refs.bib}
+\addbibresource{ReadingTime.bib}
+\addbibresource{ores_fairness.bib}
+\DeclareLanguageMapping{american}{american-apa}
+
+\defbibheading{secbib}[\bibname]{%
+  \section*{#1}%
+  \markboth{#1}{#1}%
+  \baselineskip 14.2pt%
+  \prebibhook}
+
+% Possessive citation helpers taking a single cite key:
+%   \citepos{key}  expands to  \citeauthor{key}'s (\citeyear{key})
+%   \citespos{key} expands to  \citeauthor{key}' (\citeyear{key})
+% NOTE(review): both macros are defined again with \def further down in
+% this preamble (right after subcaption is loaded).  \def overwrites
+% silently, so the later definitions are the ones in effect in the
+% document body -- confirm which variant is intended.
+\def\citepos#1{\citeauthor{#1}'s (\citeyear{#1})}
+\def\citespos#1{\citeauthor{#1}' (\citeyear{#1})}
+
+
+
+% memoir function to take out of the space out of the whitespace lists
+\firmlists
+
+% \newcommand*\abstract[1]{
+
+% LATEX NOTE: these lines will import vc stuff after running `make vc` which
+% will add version control information to the bottom of each page. This can be
+% useful for keeping track of which version of a document somebody has:
+% \input{vc}
+% \pagestyle{cdsc-page-git}
+
+% LATEX NOTE: this alternative line will just input a timestamp at the
+% build process, useful for Overleaf
+% \pagestyle{cdsc-page-overleaf}
+
+% \definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
+% \newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}%
+% \newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}%
+% \newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}%
+% \newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}%
+% \newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}%
+% \newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}%
+% \newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}%
+% \newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}%
+% \newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}%
+% \let\hlipl\hlkwb
+
+% \usepackage{framed}
+% \makeatletter
+% \newenvironment{kframe}{%
+%  \def\at@end@of@kframe{}%
+%  \ifinner\ifhmode%
+%   \def\at@end@of@kframe{\end{minipage}}%
+%   \begin{minipage}{\columnwidth}%
+%  \fi\fi%
+%  \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
+%  \colorbox{shadecolor}{##1}\hskip-\fboxsep
+%      % There is no \\@totalrightmargin, so:
+%      \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
+%  \MakeFramed {\advance\hsize-\width
+%    \@totalleftmargin\z@ \linewidth\hsize
+%    \@setminipage}}%
+%  {\par\unskip\endMakeFramed%
+%  \at@end@of@kframe}
+% \makeatother
+
+% \definecolor{shadecolor}{rgb}{.97, .97, .97}
+% \definecolor{messagecolor}{rgb}{0, 0, 0}
+% \definecolor{warningcolor}{rgb}{1, 0, 1}
+% \definecolor{errorcolor}{rgb}{1, 0, 0}
+% \newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
+
+\usepackage{alltt}
+
+\definecolor{c77a1d2}{RGB}{119,161,210}
+\definecolor{bf9837}{RGB}{191,152,55}
+\definecolor{cc0c0c0}{RGB}{192,192,192}
+\def \globalscale {0.2}
+
+\definecolor{mycomp}{RGB}{250,198,49}
+\definecolor{mymut}{RGB}{13,8,135}
+
+
+%\usepackage{wrapfig}
+\usepackage{tikz}
+\usepackage{booktabs}
+\usepackage{multicol}
+
+% TODO make table of contents HERE
+
+% TODO add Acknowledgements HERE
+
+
+% \begin{acks}
+
+% \end{acks}
+
+
+% TODO add Dedication HERE
+
+% Add Chapter Titles
+
+\usepackage{subcaption}
+% Possessive citation helpers (these \def's replace any earlier
+% definitions of \citepos / \citespos).  The author name is typeset
+% inside a brace group in which citecolor is set to black, followed by
+% 's (or a bare apostrophe for \citespos) and then \citep{key}.
+\def\citepos#1{{\hypersetup{citecolor=black}\citeauthor{#1}}'s \citep{#1}}
+\def\citespos#1{{\hypersetup{citecolor=black}\citeauthor{#1}}' \citep{#1}}
+% Keep the original \citeauthor as \oldciteauthor, then wrap it so that
+% every \citeauthor call is typeset with citecolor=black inside a group.
+% The wrapper is declared with exactly one mandatory argument, so any
+% optional arguments or starred form are not forwarded to
+% \oldciteauthor.
+\let\oldciteauthor=\citeauthor
+\def\citeauthor#1{{\hypersetup{citecolor=black}\oldciteauthor{#1}}}
+%%
+\usepackage[htt]{hyphenat}
+\usepackage{commath}
+\usepackage{mathtools}
+
+% Replace the stock \widetilde accent: the argument is \smash'ed (so its
+% own height/depth are ignored), given a 1.15ex-high zero-width strut,
+% and a \sim is stacked on top of it with \stackrel.  The result is
+% wrapped in \mathbin, so TeX treats it as a binary operator for
+% spacing purposes.
+\renewcommand{\widetilde}[1]{\mathbin{%
+    \stackrel{\sim}{\smash{#1} \rule{0pt}{1.15ex}}%
+    }}
+
+
+\let\oldnorm\norm   % <-- Store original \norm as \oldnorm
+\let\norm\undefined % <-- "Undefine" \norm
+\DeclarePairedDelimiter\norm{\lVert}{\rVert}
+
+%% end of the preamble, start of the body of the document source.
+\hyphenation{shit-gun-con-trol-lers-say com-mer-cial-real-est-ate real-est-ate sub-red-dit sub-red-dits real-est-ate-in-vest-ing fin-an-cial-in-de-pen-dence in-fin-ite-war-fare vint-age-aud-io rus-sia-la-go march-ag-ainst-trump}
+\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
+
+\def\Slash{\slash\hspace{0pt}}
+
+% \chapterstyle{thatcher}
+
+% this one is also good and more formal
+% \chapterstyle{thatcher}
+
+% \renewcommand*{\chapterheadstart}{\begingroup
+%   \vspace*{\beforechapskip}%
+%   \begin{adjustwidth}{}{-\chapindent}%
+%   \hrulefill
+%   \smash{\rule{0.4pt}{15mm}}
+%   \end{adjustwidth}\endgroup}
+\usepackage{longtable}
+\usepackage{color, colortbl}
+\definecolor{lavenderblue}{rgb}{0.9, 0.9, 0.98}
+\usepackage{graphicx}
+\usepackage{multirow}
+\usepackage{svg}
+\usepackage{afterpage}
+
+%% magic command to not add links on \citeauthor
+%% \pretocmd (etoolbox) prepends code to \NAT@citexnum: when \NAT@ctype
+%% is greater than zero, \NAT@hyper@ is \let to \relax.  The last two
+%% arguments are the success and failure hooks; both are empty, so a
+%% failed patch produces no message.
+%% NOTE(review): this preamble loads biblatex with natbib=true rather
+%% than the natbib package itself -- confirm that \NAT@citexnum is
+%% actually defined here, otherwise this patch silently does nothing.
+\usepackage{etoolbox}
+\makeatletter
+\pretocmd{\NAT@citexnum}{\@ifnum{\NAT@ctype>\z@}{\let\NAT@hyper@\relax}{}}{}{}
+\makeatother
+
+\DeclareMathOperator*{\argmin}{arg\,min} % thin space, limits underneath in displays
+\DeclareMathOperator*{\argmax}{arg\,max} % thin space, limits underneath in displays
+
+
+\begin{document}
+\tableofcontents
+\listoffigures
+\listoftables
+\chapter*[Acknowledgments]{Acknowledgments}
+
+I am grateful to the many academic friends, colleagues, and mentors who have cultivated my intellectual development, helped me work on these ideas, and in every other way made possible my success. 
+I would especially like to thank members of the Community Data Science Collective and Aaron Shaw, Sohyeon Hwang, Jeremy Foote, Carl Colglazier, Floor Fiers, Sejal Khatri, Sefania Druga, Nicholas Vincent, and Kaylea Champion for their helpful feedback on parts of this work.
+Also thanks to Mako and Aaron for their innovation, dedication and care in organizing this very special research group.
+I am also grateful to my collaborators I have not yet mentioned: Isabella Brown, Laura (Alia) Levi, Nicole McGinnis, Tilman Bayer, Olga Vasileva, and Aaron Halfaker.
+Special thanks to Daryn McElroy for her work to externally validate our clusters.
+Thanks to Mark Kott for his excellent course on mathematical ecology, which inspired an important turning point in the direction of this work, and to Carmen Gonzalez and Matthew Powers for their fantastic course on fieldwork research methods. The importance of this education in qualitative research to this work surprised me, but I doubt it would surprise them.
+I am also grateful to the organizers and participants in the social computing reading group (SCRG) at the University of Washington.  My participation in this reading group has been invaluable to any ability I have to make contributions to social computing or HCI.
+I owe special gratitude to my 20 interview participants for their time and knowledge.
+I am thankful to the organizers and members of UAW Local 4121 for their strength and solidarity.  
+Thanks to Jason Baumgartner and pushshift.io for the Reddit data archive.
+This work was made possible by generous financial support from National Science Foundation grants IIS-1908850, IIS-1910202, and GRFP2016220885 and was facilitated through the use of the advanced computational infrastructure provided by the Hyak supercomputer system at the University of Washington.
+
+\chapter*[Dedication]{Dedication}
+To Amanda, my dear full mutualist.
+
+
+\chapter*[Preface to Chapter 1]{Preface to Chapter 1}
+Several paragraphs in the beginning of the following chapter are adapted from text I wrote for a grant proposal submitted to the National Science Foundation (\url{https://www.nsf.gov/awardsearch/showAward?AWD_ID=1910202}, 1910202).
+\begin{refsection}
+\chapter[An Ecology of Digital Affiliation]{Introduction: An Ecology of Digital Affiliation}
+\input{ch1_intro.tex}
+% \end{refsection}
+% \begin{refsection}
+\chapter*[Preface to Chapter 2]{Preface to Chapter 2}
+The following chapter is a collaborative work with Benjamin Mako Hill.
+
+\noindent It was honored with a Top Paper award from the Computational Methods Division of the International Communication Association's 2021 annual meeting.  An early version of this chapter was presented at the 2020 International Conference for Computational Social Science (IC2S2 2020).  
+\chapter[Identifying Competition and Mutualism]{Identifying Competition and Mutualism Between Online Groups}
+\input{ch2_identifying.tex}
+% \end{refsection} 
+% \begin{refsection}
+\chapter*[Preface to Chapter 3]{Preface to Chapter 3}
+
+An important finding from Chapter 2 is that mutualism is much more common than competition among overlapping subreddits.  This finding was also surprising because ecological theory and prior results in social computing suggest that greater niche overlaps result in stronger competition.  Furthermore, theories of organizational ecology were insufficient for explaining the reasons why overlapping online communities exist in the first place. Therefore, the qualitative investigation presented in Chapter 3 provided important explanation and validation of the quantitative finding of widespread mutualism in terms of the experiences and understandings of active participants in overlapping subreddits.  If the findings from Chapter 3 had been known in advance of Chapter 2's study, Chapter 2 would have been more likely to anticipate widespread mutualism and may have been designed to explain it.  
+
+Because Chapters 2, 3, and 4 are each written as stand-alone articles, some parts of the background section of Chapter 3, most notably the first 3 paragraphs of §3.2, make some of the same points as the background section of Chapter 2.
+Also, the interview recruitment process uses an earlier version of the clustering algorithm from Chapter 2 (before it was improved during a revise and resubmit process). The second paragraph of §3.5 summarizes the clustering procedure.  Readers of Chapter 2 may quickly pass over these paragraphs.
+
+This chapter is a collaborative work with Charles Kiene, Isabella Brown, Laura (Alia) Levi, Nicole McGinnis, and Benjamin Mako Hill and is under review in Proceedings of the ACM on Human-Computer Interaction: Computer Supported Cooperative Work.
+
+  \chapter[No Community Can Do Everything]{No Community Can Do Everything: Why People Participate in Similar Online Communities}
+\input{equalogy.tex}
+% \end{refsection} 
+% \begin{refsection}
+\chapter*[Preface to Chapter 4]{Preface to Chapter 4}
+As was the case with Chapter 3, Chapter 4 is written as a stand-alone article building upon Chapter 3.  It repeats some of the same motivating points in the first paragraph of §4.1, and the first two paragraphs of §4.2.
+
+This study also reuses the clustering procedure from Chapter 2, but on a larger dataset. The first three paragraphs of §4.3 describe the clustering procedure. Those who have read Chapter 2 may quickly pass over these paragraphs, noting that the sample size, dimensionality of LSI, and the number of clusters are different from Chapter 2. 
+
+\chapter[Dynamics of Ecological Adaptation]{Dynamics of Ecological Adaptation in Online Communities}
+\input{ch4_competitive_exclusion.tex}
+% \end{refsection} 
+% \begin{refsection}
+\chapter[Future Directions]{Future Directions in the Ecology of Online Communities}
+\input{ch5_conclusion.tex}
+\end{refsection}                
+\appendix
+\begin{refsection}
+\chapter*[Preface to Appendix A]{Preface to Appendix A}
+The following appendix is published in the Proceedings of The 17th International Symposium on Open Collaboration.
+
+\chapter[Measuring Article Quality]{Measuring Wikipedia Article Quality in One Dimension by Extending ORES with Ordinal Regression}
+\input{appendix_A_articlequality.tex}
+\end{refsection} 
+\chapter*[Preface to Appendix B]{Preface to Appendix B}
+ The following appendix is a collaborative work with Tilman Bayer and Olga Vasileva and is published in the Proceedings of The 15th International Symposium on Open Collaboration.
+\begin{refsection}
+  \chapter[Dwelling on Wikipedia]{Dwelling on Wikipedia: Investigating time spent by global encyclopedia readers}
+\input{appendix_B_readingtime.tex}
+\end{refsection} 
+\chapter*[Preface to Appendix C]{Preface to Appendix C}
+  The following appendix is a collaborative work with Benjamin Mako Hill and Aaron Halfaker and is published in the Proceedings of ACM on Human-Computer Interaction: Computer Supported Cooperative Work.
+\begin{refsection}
+  \chapter[Effects of Algorithmic Flagging on Fairness]{Effects of Algorithmic Flagging on Fairness: Quasi-experimental Evidence from Wikipedia}
+\input{appendix_C_oresfairness.tex}
+\end{refsection} 
+
+% \appendix
+
+%\renewcommand{\thechapter}{A} after \chapter{Test Appendix}
+
+% \addtocontents{toc}{\setlength\cftchapternumwidth{1em}}
+% \renewcommand\thechapter{}
+
+
+
+% \begin{refsection}
+%   \chapter[Future Directions]{Future Directions in the Ecology of Online Communities}
+% \input{appendix_B.tex}
+% \end{refsection} 
+% \begin{refsection}
+%   \chapter[Future Directions]{Future Directions in the Ecology of Online Communities}
+% \input{appendix_C.tex}
+% \end{refsection} 
+
+
+
+% bibliography here
+% \setcounter{biburlnumpenalty}{9001}
+% \printbibliography[title = {References}, heading=secbib]
+\end{document}
+
+%  LocalWords:  
--- a/dissertations/nathante_uw_2021/Makefile
+++ b/dissertations/nathante_uw_2021/Makefile
@ -0,0 +1,32 @@
+#!/usr/bin/make
+
+all: ETD_version.pdf
+	pdftk copyright_page.pdf title_page.pdf abstract.pdf ETD_version.pdf cat output diss_ecology_of_online_communities.pdf
+
+# use the following section for Rnw/knitr documents
+# all: $(patsubst %.Rnw,%.pdf,$(wildcard *.Rnw))
+# %.tex: %.Rnw
+#	Rscript -e "library(knitr); knit('$<')"
+
+%.pdf: %.tex 
+	latexmk -f -pdf $<
+
+clean: 
+	latexmk -C *.tex
+	rm -f *.tmp *.run.xml
+	rm -f vc
+	rm -f *.bbl
+
+# the following lines are useful for Rnw/knitr
+# rm -rf cache/ figure/
+# rm -f *.tex
+
+viewpdf: all
+	evince *.pdf
+
+vc:	resources/vc-git
+
+pdf: all
+
+# Targets that name actions rather than files they create.  `pdf` and
+# `viewpdf` are listed alongside `all` and `clean` so that a stray file
+# with one of those names can never make the target look up to date.
+.PHONY: all clean pdf viewpdf
+# .PRECIOUS: %.tex
--- a/dissertations/nathante_uw_2021/ReadingTime.bib
+++ b/dissertations/nathante_uw_2021/ReadingTime.bib
@ -0,0 +1,818 @@
+
+@article{hill_wikipedia_2013,
+  title = {The {{Wikipedia}} Gender Gap Revisited: Characterizing Survey Response Bias with Propensity Score Estimation},
+  volume = {8},
+  shorttitle = {The {{Wikipedia Gender Gap Revisited}}},
+  number = {6},
+  journal = {PLoS ONE},
+  doi = {10.1371/journal.pone.0065782},
+  author = {Hill, Benjamin Mako and Shaw, Aaron},
+  month = jun,
+  year = {2013}
+}
+
+@inproceedings{antin_social_2012,
+  address = {{New York, NY, USA}},
+  series = {{{CHI}} '12},
+  title = {Social Desirability Bias and Self-Reports of Motivation: A Study of {{Amazon Mechanical Turk}} in the {{US}} and {{India}}},
+  isbn = {978-1-4503-1015-4},
+  shorttitle = {Social {{Desirability Bias}} and {{Self}}-Reports of {{Motivation}}},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2207676.2208699},
+  author = {Antin, Judd and Shaw, Aaron},
+  year = {2012},
+  keywords = {distributed work,social desirability,motivation,crowdsourcing,amazon mechanical turk},
+  pages = {2925--2934}
+}
+
+@article{preece_reader--leader_2009,
+  title = {The Reader-to-Leader Framework: Motivating Technology-Mediated Social Participation},
+  volume = {1},
+  issn = {1944-3900},
+  shorttitle = {The {{Reader}}-to-{{Leader Framework}}},
+  number = {1},
+  journal = {AIS Transactions on Human-Computer Interaction},
+  author = {Preece, Jennifer and Shneiderman, Ben},
+  year = {2009},
+  pages = {13-32}
+}
+
+@inproceedings{arazy_functional_2015,
+  address = {{New York, NY}},
+  series = {{{CSCW}} '15},
+  title = {Functional Roles and Career Paths in {{Wikipedia}}},
+  isbn = {978-1-4503-2922-4},
+  booktitle = {Proceedings of the 18th {{ACM Conference}} on {{Computer Supported Cooperative Work}} \& {{Social Computing}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2675133.2675257},
+  author = {Arazy, Ofer and Ortega, Felipe and Nov, Oded and Yeo, Lisa and Balila, Adam},
+  year = {2015},
+  keywords = {peer-production,functional roles,role transitions,ORGANIZATIONAL structure,wikipedia},
+  pages = {1092--1105}
+}
+
+@inproceedings{warncke-wang_misalignment_2015,
+  title = {Misalignment {{Between Supply}} and {{Demand}} of {{Quality Content}} in {{Peer Production Communities}}},
+  language = {en},
+  urldate = {2016-08-15},
+  booktitle = {Ninth {{International AAAI Conference}} on {{Web}} and {{Social Media}}},
+  url = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM15/paper/view/10591},
+  author = {{Warncke-Wang}, Morten and Ranjan, Vivek and Terveen, Loren and Hecht, Brent},
+  month = apr,
+  year = {2015}
+}
+
+@article{stvilia_issues_2009,
+  title = {Issues of Cross-Contextual Information Quality Evaluation\textemdash{{The}} Case of {{Arabic}}, {{English}}, and {{Korean Wikipedias}}},
+  volume = {31},
+  issn = {07408188},
+  language = {en},
+  number = {4},
+  journal = {Library \& Information Science Research},
+  doi = {10.1016/j.lisr.2009.07.005},
+  author = {Stvilia, Besiki and {Al-Faraj}, Abdullah and Yi, Yong Jeong},
+  month = dec,
+  year = {2009},
+  pages = {232-239}
+}
+
+@article{johnson_emergence_2014,
+  title = {Emergence of Power Laws in Online Communities: {{The}} Role of Social Mechanisms and Preferential Attachment.},
+  volume = {38},
+  shorttitle = {Emergence of {{Power Laws}} in {{Online Communities}}},
+  number = {3},
+  urldate = {2017-04-26},
+  journal = {Management Information Systems Quarterly},
+  url = {http://aisel.aisnet.org/cgi/viewcontent.cgi?article=3193\&context=misq},
+  author = {Johnson, Steven L. and Faraj, Samer and Kudaravalli, Srinivas},
+  year = {2014},
+  pages = {795--808}
+}
+
+@article{boyd_critical_2012,
+  title = {Critical {{Questions For Big Data}}: {{Provocations}} for a Cultural, Technological, and Scholarly Phenomenon},
+  volume = {15},
+  issn = {1369-118X, 1468-4462},
+  shorttitle = {{{CRITICAL QUESTIONS FOR BIG DATA}}},
+  language = {en},
+  number = {5},
+  journal = {Information, Communication \& Society},
+  doi = {10.1080/1369118X.2012.678878},
+  author = {{boyd}, danah and Crawford, Kate},
+  month = jun,
+  year = {2012},
+  pages = {662-679}
+}
+
+@article{shaw_pipeline_2018,
+  title = {The Pipeline of Online Participation Inequalities: The Case of {{Wikipedia}} Editing},
+  volume = {68},
+  issn = {0021-9916},
+  shorttitle = {The {{Pipeline}} of {{Online Participation Inequalities}}},
+  language = {en},
+  number = {1},
+  journal = {Journal of Communication},
+  doi = {10.1093/joc/jqx003},
+  author = {Shaw, Aaron and Hargittai, Eszter},
+  month = feb,
+  year = {2018},
+  pages = {143-168}
+}
+
+@article{pal_exponentiated_2006,
+  title = {Exponentiated {{Weibull}} Distribution},
+  volume = {66},
+  copyright = {Copyright (c)},
+  issn = {1973-2201},
+  language = {en},
+  number = {2},
+  journal = {Statistica},
+  doi = {10.6092/issn.1973-2201/493},
+  author = {Pal, Manisha and Ali, M. Masoom and Woo, Jungsoo},
+  year = {2006},
+  pages = {139-147}
+}
+
+@article{gupta_exponentiated_2001,
+  title = {Exponentiated {{Exponential Family}}: {{An Alternative}} to {{Gamma}} and {{Weibull Distributions}}},
+  volume = {43},
+  copyright = {\textcopyright{} 2001 WILEY-VCH Verlag Berlin GmbH, Fed. Rep. of Germany},
+  issn = {1521-4036},
+  shorttitle = {Exponentiated {{Exponential Family}}},
+  language = {en},
+  number = {1},
+  journal = {Biometrical Journal},
+  doi = {10.1002/1521-4036(200102)43:1<117::AID-BIMJ117>3.0.CO;2-R},
+  author = {Gupta, Rameshwar D. and Kundu, Debasis},
+  month = feb,
+  year = {2001},
+  keywords = {Fisher Information matrix,Gamma distribution,Hazard rate ordering,Likelihood ratio ordering,Maximum Likelihood Estimator,Stochastic ordering,Weibull distribution},
+  pages = {117-130}
+}
+
+@inproceedings{liu_understanding_2010,
+  address = {{New York, NY, USA}},
+  series = {{{SIGIR}} '10},
+  title = {Understanding {{Web Browsing Behaviors Through Weibull Analysis}} of {{Dwell Time}}},
+  isbn = {978-1-4503-0153-4},
+  booktitle = {Proceedings of the 33rd {{International ACM SIGIR Conference}} on {{Research}} and {{Development}} in {{Information Retrieval}}},
+  publisher = {{ACM}},
+  doi = {10.1145/1835449.1835513},
+  author = {Liu, Chao and White, Ryen W. and Dumais, Susan},
+  year = {2010},
+  keywords = {dwell time,user behaviors,web browsing,Weibull analysis},
+  pages = {379--386}
+}
+
+@article{mitzenmacher_brief_2004,
+  title = {A {{Brief History}} of {{Generative Models}} for {{Power Law}} and {{Lognormal Distributions}}},
+  volume = {1},
+  issn = {1542-7951},
+  number = {2},
+  journal = {Internet Mathematics},
+  doi = {10.1080/15427951.2004.10129088},
+  author = {Mitzenmacher, Michael},
+  month = jan,
+  year = {2004},
+  pages = {226-251}
+}
+
+@inproceedings{miquel-ribe_cultural_2016,
+  address = {{New York, NY, USA}},
+  series = {{{SMSociety}} '16},
+  title = {Cultural {{Identities}} in {{Wikipedias}}},
+  isbn = {978-1-4503-3938-4},
+  booktitle = {Proceedings of the 7th 2016 {{International Conference}} on {{Social Media}} \& {{Society}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2930971.2930996},
+  author = {{Miquel-Rib{\'e}}, Marc and Laniado, David},
+  year = {2016},
+  keywords = {Wikipedia,Online Communities,Analytics \& Data Mining,Cross-cultural studies,Cultural Identity},
+  pages = {24:1--24:10}
+}
+
+@inproceedings{lehmann_reader_2014,
+  address = {{New York, NY, USA}},
+  series = {{{HT}} '14},
+  title = {Reader {{Preferences}} and {{Behavior}} on {{Wikipedia}}},
+  isbn = {978-1-4503-2954-5},
+  booktitle = {Proceedings of the 25th {{ACM Conference}} on {{Hypertext}} and {{Social Media}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2631775.2631805},
+  author = {Lehmann, Janette and {M{\"u}ller-Birn}, Claudia and Laniado, David and Lalmas, Mounia and Kaltenbrunner, Andreas},
+  year = {2014},
+  keywords = {article quality,reader,wikipedia,human factors,editor,engagement,measurement,reading behavior,reading interest},
+  pages = {88--97}
+}
+
+@article{baliamounelutz_analysis_2003,
+  title = {An Analysis of the Determinants and Effects of {{ICT}} Diffusion in Developing Countries},
+  volume = {10},
+  copyright = {Copyright \textcopyright{} 2003 IOS Press},
+  issn = {1554-0170},
+  language = {en},
+  number = {3},
+  journal = {Information Technology for Development},
+  doi = {10.1002/itdj.1590100303},
+  author = {Baliamoune-Lutz, Mina},
+  month = jun,
+  year = {2003},
+  pages = {151-169}
+}
+
+@article{pearce_digital_2013,
+  title = {Digital {{Divides From Access}} to {{Activities}}: {{Comparing Mobile}} and {{Personal Computer Internet Users}}},
+  volume = {63},
+  copyright = {\textcopyright{} 2013 International Communication Association},
+  issn = {1460-2466},
+  shorttitle = {Digital {{Divides From Access}} to {{Activities}}},
+  language = {en},
+  number = {4},
+  journal = {Journal of Communication},
+  doi = {10.1111/jcom.12045},
+  author = {Pearce, Katy E. and Rice, Ronald E.},
+  month = aug,
+  year = {2013},
+  pages = {721-744}
+}
+
+@article{marler_mobile_2018,
+  title = {Mobile Phones and Inequality: {{Findings}}, Trends, and Future Directions},
+  volume = {20},
+  issn = {1461-4448},
+  shorttitle = {Mobile Phones and Inequality},
+  language = {en},
+  number = {9},
+  journal = {New Media \& Society},
+  doi = {10.1177/1461444818765154},
+  author = {Marler, Will},
+  month = sep,
+  year = {2018},
+  pages = {3498-3520}
+}
+
+@article{asadi_motivating_2013,
+  title = {Motivating and Discouraging Factors for {{Wikipedians}}: The Case Study of {{Persian Wikipedia}}},
+  volume = {62},
+  issn = {0024-2535},
+  shorttitle = {Motivating and Discouraging Factors for {{Wikipedians}}},
+  number = {4/5},
+  journal = {Library Review},
+  doi = {10.1108/LR-10-2012-0114},
+  author = {Asadi, Saeid and Ghafghazi, Shadi and Jamali, Hamid R.},
+  month = jul,
+  year = {2013},
+  pages = {237-252}
+}
+
+@article{ojanpera_engagement_2017,
+  title = {Engagement in the {{Knowledge Economy}}: {{Regional Patterns}} of {{Content Creation}} with a {{Focus}} on {{Sub}}-{{Saharan Africa}}},
+  volume = {13},
+  issn = {1544-7529},
+  shorttitle = {Engagement in the {{Knowledge Economy}}},
+  language = {en},
+  number = {0},
+  urldate = {2018-10-22},
+  journal = {Information Technologies \& International Development},
+  url = {https://itidjournal.org/index.php/itid/article/view/1479},
+  author = {Ojanper{\"a}, Sanna and Graham, Mark and Straumann, Ralph and De Sabbata, Stefano and Zook, Matthew},
+  month = mar,
+  year = {2017},
+  keywords = {digital divide,domain registrations,geographies of knowledge,GitHub,information geographies,international development},
+  pages = {19}
+}
+
+@article{he_the_tower_of_babel.jpg:_nodate,
+  title = {The\_{{Tower}}\_of\_{{Babel}}.Jpg: {{Diversity}} of {{Visual Encyclopedic Knowledge Across Wikipedia Language Editions}}},
+  language = {en},
+  author = {He, Shiqing and Lin, Allen Yilun and Adar, Eytan and Hecht, Brent},
+  pages = {10}
+}
+
+@inproceedings{halfaker_user_2015,
+  address = {{Republic and Canton of Geneva, Switzerland}},
+  series = {{{WWW}} '15},
+  title = {User {{Session Identification Based}} on {{Strong Regularities}} in {{Inter}}-Activity {{Time}}},
+  isbn = {978-1-4503-3469-3},
+  booktitle = {Proceedings of the 24th {{International Conference}} on {{World Wide Web}}},
+  publisher = {{International World Wide Web Conferences Steering Committee}},
+  doi = {10.1145/2736277.2741117},
+  author = {Halfaker, Aaron and Keyes, Os and Kluver, Daniel and {Thebault-Spieker}, Jacob and Nguyen, Tien and Shores, Kenneth and Uduwage, Anuradha and {Warncke-Wang}, Morten},
+  year = {2015},
+  keywords = {activity,modeling,metrics,analytics,human behavior,regularities,user session},
+  pages = {410--418}
+}
+
+@article{kocielnik_reciprocity_2018,
+  title = {Reciprocity and {{Donation}}: {{How Article Topic}}, {{Quality}} and {{Dwell Time Predict Banner Donation}} on {{Wikipedia}}},
+  volume = {2},
+  issn = {25730142},
+  shorttitle = {Reciprocity and {{Donation}}},
+  language = {en},
+  number = {CSCW},
+  journal = {Proceedings of the ACM on Human-Computer Interaction},
+  doi = {10.1145/3274360},
+  author = {Kocielnik, Rafal and Keyes, Os and Morgan, Jonathan T. and Taraborelli, Dario and McDonald, David W. and Hsieh, Gary},
+  month = nov,
+  year = {2018},
+  pages = {1-20}
+}
+
+@article{kiesler_response_1986,
+  title = {Response {{Effects}} in the {{Electronic Survey}}},
+  volume = {50},
+  issn = {0033-362X},
+  language = {en},
+  number = {3},
+  journal = {Public Opinion Quarterly},
+  doi = {10.1086/268992},
+  author = {Kiesler, Sara and Sproull, Lee S.},
+  month = jan,
+  year = {1986},
+  pages = {402-413}
+}
+
+@article{phillips_effects_1972,
+  title = {Some {{Effects}} of "{{Social Desirability}}" in {{Survey Studies}}},
+  volume = {77},
+  issn = {0002-9602},
+  number = {5},
+  journal = {American Journal of Sociology},
+  doi = {10.1086/225231},
+  author = {Phillips, Derek L. and Clancy, Kevin J.},
+  month = mar,
+  year = {1972},
+  pages = {921-940}
+}
+
+@article{clauset_power-law_2009,
+  title = {Power-{{Law Distributions}} in {{Empirical Data}}},
+  volume = {51},
+  issn = {0036-1445},
+  number = {4},
+  journal = {SIAM Review},
+  doi = {10.1137/070710111},
+  author = {Clauset, A. and Shalizi, C. and Newman, M.},
+  month = nov,
+  year = {2009},
+  pages = {661-703}
+}
+
+@article{stumpf_critical_2012,
+  title = {Critical {{Truths About Power Laws}}},
+  volume = {335},
+  copyright = {Copyright \textcopyright{} 2012, American Association for the Advancement of Science},
+  issn = {0036-8075, 1095-9203},
+  language = {en},
+  number = {6069},
+  journal = {Science},
+  doi = {10.1126/science.1216142},
+  author = {Stumpf, Michael P. H. and Porter, Mason A.},
+  month = feb,
+  year = {2012},
+  pages = {665-666},
+  pmid = {22323807}
+}
+
+@article{pepinsky_visual_2018,
+  title = {Visual Heuristics for Marginal Effects Plots},
+  volume = {5},
+  issn = {2053-1680},
+  language = {en},
+  number = {1},
+  journal = {Research \& Politics},
+  doi = {10.1177/2053168018756668},
+  author = {Pepinsky, Thomas B.},
+  month = jan,
+  year = {2018},
+  pages = {2053168018756668}
+}
+
+@inproceedings{singer_why_2017,
+  archivePrefix = {arXiv},
+  title = {Why {{We Read Wikipedia}}},
+  language = {en},
+  booktitle = {Proceedings of the 26th {{International Conference}} on {{World Wide Web}} - {{WWW}} '17},
+  doi = {10.1145/3038912.3052716},
+  author = {Singer, Philipp and Lemmerich, Florian and West, Robert and Zia, Leila and Wulczyn, Ellery and Strohmaier, Markus and Leskovec, Jure},
+  year = {2017},
+  keywords = {Computer Science - Digital Libraries,Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks},
+  pages = {1591-1600}
+}
+
+@inproceedings{lemmerich_why_2019,
+  address = {{New York, NY, USA}},
+  series = {{{WSDM}} '19},
+  title = {Why the {{World Reads Wikipedia}}: {{Beyond English Speakers}}},
+  isbn = {978-1-4503-5940-5},
+  shorttitle = {Why the {{World Reads Wikipedia}}},
+  booktitle = {Proceedings of the {{Twelfth ACM International Conference}} on {{Web Search}} and {{Data Mining}}},
+  publisher = {{ACM}},
+  doi = {10.1145/3289600.3291021},
+  author = {Lemmerich, Florian and {S{\'a}ez-Trumper}, Diego and West, Robert and Zia, Leila},
+  year = {2019},
+  keywords = {motivation,survey,wikipedia,cross-cultural analysis,log analysis,multi-language},
+  pages = {618--626}
+}
+
+@inproceedings{paranjape_improving_2016,
+  address = {{New York, NY, USA}},
+  series = {{{WSDM}} '16},
+  title = {Improving {{Website Hyperlink Structure Using Server Logs}}},
+  isbn = {978-1-4503-3716-8},
+  booktitle = {Proceedings of the {{Ninth ACM International Conference}} on {{Web Search}} and {{Data Mining}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2835776.2835832},
+  author = {Paranjape, Ashwin and West, Robert and Zia, Leila and Leskovec, Jure},
+  year = {2016},
+  keywords = {log analysis,browsing,link prediction,navigation},
+  pages = {615--624}
+}
+
+@inproceedings{yi_beyond_2014,
+  address = {{New York, NY, USA}},
+  series = {{{RecSys}} '14},
+  title = {Beyond {{Clicks}}: {{Dwell Time}} for {{Personalization}}},
+  isbn = {978-1-4503-2668-1},
+  shorttitle = {Beyond {{Clicks}}},
+  booktitle = {Proceedings of the 8th {{ACM Conference}} on {{Recommender Systems}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2645710.2645724},
+  author = {Yi, Xing and Hong, Liangjie and Zhong, Erheng and Liu, Nanthan Nan and Rajan, Suju},
+  year = {2014},
+  keywords = {collaborative filtering,dwell time,content recommendation,learning to rank,personalization},
+  pages = {113--120}
+}
+
+@inproceedings{balachandran_modeling_2014,
+  address = {{New York, NY, USA}},
+  series = {{{MobiCom}} '14},
+  title = {Modeling {{Web Quality}}-of-Experience on {{Cellular Networks}}},
+  isbn = {978-1-4503-2783-1},
+  booktitle = {Proceedings of the 20th {{Annual International Conference}} on {{Mobile Computing}} and {{Networking}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2639108.2639137},
+  author = {Balachandran, Athula and Aggarwal, Vaneet and Halepovic, Emir and Pang, Jeffrey and Seshan, Srinivasan and Venkataraman, Shobha and Yan, He},
+  year = {2014},
+  keywords = {performance,web browsing,cellular network,quality of experience (qoe)},
+  pages = {213--224}
+}
+
+@inproceedings{yin_silence_2013,
+  address = {{New York, NY, USA}},
+  series = {{{KDD}} '13},
+  title = {Silence Is {{Also Evidence}}: {{Interpreting Dwell Time}} for {{Recommendation}} from {{Psychological Perspective}}},
+  isbn = {978-1-4503-2174-7},
+  shorttitle = {Silence Is {{Also Evidence}}},
+  booktitle = {Proceedings of the 19th {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2487575.2487663},
+  author = {Yin, Peifeng and Luo, Ping and Lee, Wang-Chien and Wang, Min},
+  year = {2013},
+  keywords = {recommendation,dwell time,psychological},
+  pages = {989--997}
+}
+
+@article{soler-adillon_wikipedia_2017,
+  title = {Wikipedia Access and Contribution: {{Language}} Choice in Multilingual Communities. {{A}} Case Study},
+  volume = {0},
+  copyright = {Copyright (c) 2017 Joan Soler-Adillon, Pere Freixa},
+  issn = {2340-5236},
+  shorttitle = {Wikipedia Access and Contribution},
+  language = {en},
+  number = {57},
+  journal = {An{\`a}lisi},
+  doi = {10.5565/rev/analisi.3109},
+  author = {{Soler-Adillon}, Joan and Freixa, Pere},
+  month = dec,
+  year = {2017},
+  keywords = {literacy,Internet,internet,Wikipedia,Viquipèdia,alfabetització,alfabetización,competence,competencia,competència,cultura digital,digital culture,estudiantes,estudiants,informació,información,information,linguistic choice,multilingualism,multilingüisme,multilingüismo,opció lingüística,opción lingüística,students,universidad,universitat,university},
+  pages = {63-80}
+}
+
+@article{graham_warped_2008,
+  title = {Warped {{Geographies}} of {{Development}}: {{The Internet}} and {{Theories}} of {{Economic Development}}},
+  volume = {2},
+  copyright = {\textcopyright{} 2008 The Author. Journal Compilation \textcopyright{} 2008 Blackwell Publishing Ltd},
+  issn = {1749-8198},
+  shorttitle = {Warped {{Geographies}} of {{Development}}},
+  language = {en},
+  number = {3},
+  journal = {Geography Compass},
+  doi = {10.1111/j.1749-8198.2008.00093.x},
+  author = {Graham, Mark},
+  year = {2008},
+  pages = {771-789}
+}
+
+@article{graham_uneven_2014,
+  title = {Uneven {{Geographies}} of {{User}}-{{Generated Information}}: {{Patterns}} of {{Increasing Informational Poverty}}},
+  volume = {104},
+  issn = {0004-5608},
+  shorttitle = {Uneven {{Geographies}} of {{User}}-{{Generated Information}}},
+  number = {4},
+  journal = {Annals of the Association of American Geographers},
+  doi = {10.1080/00045608.2014.910087},
+  author = {Graham, Mark and Hogan, Bernie and Straumann, Ralph K. and Medhat, Ahmed},
+  month = jul,
+  year = {2014},
+  keywords = {representation,Wikipedia,geographies of knowledge,geografía de la Internet,geografías del conocimiento,geoweb,Internet geography,representación,互联网地理,再现,地理网络,知识地理,维基百科},
+  pages = {746-764}
+}
+
+@article{graham_geography_2013,
+  title = {Geography and the Future of Big Data, Big Data and the Future of Geography},
+  volume = {3},
+  issn = {2043-8206},
+  language = {en},
+  number = {3},
+  journal = {Dialogues in Human Geography},
+  doi = {10.1177/2043820613513121},
+  author = {Graham, Mark and Shelton, Taylor},
+  month = nov,
+  year = {2013},
+  pages = {255-261}
+}
+
+@article{fiesler_participant_2018,
+  title = {``{{Participant}}'' {{Perceptions}} of {{Twitter Research Ethics}}},
+  volume = {4},
+  issn = {2056-3051},
+  language = {en},
+  number = {1},
+  journal = {Social Media + Society},
+  doi = {10.1177/2056305118763366},
+  author = {Fiesler, Casey and Proferes, Nicholas},
+  month = jan,
+  year = {2018},
+  pages = {2056305118763366}
+}
+
+@article{napoli_emerging_2014,
+  title = {The {{Emerging Mobile Internet Underclass}}: {{A Critique}} of {{Mobile Internet Access}}},
+  volume = {30},
+  issn = {0197-2243},
+  shorttitle = {The {{Emerging Mobile Internet Underclass}}},
+  number = {5},
+  journal = {The Information Society},
+  doi = {10.1080/01972243.2014.944726},
+  author = {Napoli, Philip M. and Obar, Jonathan A.},
+  month = oct,
+  year = {2014},
+  keywords = {Internet,digital divide,access,mobile Internet,smartphones},
+  pages = {323-334}
+}
+
+@article{scheerder_determinants_2017,
+  title = {Determinants of {{Internet}} Skills, Uses and Outcomes. {{A}} Systematic Review of the Second- and Third-Level Digital Divide},
+  volume = {34},
+  issn = {0736-5853},
+  number = {8},
+  journal = {Telematics and Informatics},
+  doi = {10.1016/j.tele.2017.07.007},
+  author = {Scheerder, Anique and {van Deursen}, Alexander and {van Dijk}, Jan},
+  month = dec,
+  year = {2017},
+  keywords = {Internet skills,Digital divide,Internet outcomes,Internet use,Systematic literature review},
+  pages = {1607-1624}
+}
+
+@article{buchi_modeling_2016,
+  title = {Modeling the Second-Level Digital Divide: {{A}} Five-Country Study of Social Differences in {{Internet}} Use},
+  volume = {18},
+  issn = {1461-4448},
+  shorttitle = {Modeling the Second-Level Digital Divide},
+  language = {en},
+  number = {11},
+  journal = {New Media \& Society},
+  doi = {10.1177/1461444815604154},
+  author = {B{\"u}chi, Moritz and Just, Natascha and Latzer, Michael},
+  month = dec,
+  year = {2016},
+  pages = {2703-2722}
+}
+
+@article{deursen_compoundness_2017,
+  title = {The Compoundness and Sequentiality of Digital Inequality},
+  volume = {11},
+  copyright = {cc\_by\_nc\_nd},
+  issn = {1932-8036},
+  language = {en},
+  journal = {International Journal of Communication},
+  author = {{van Deursen}, Alexander J. A. M. and Helsper, Ellen and Eynon, Rebecca and {van Dijk}, Jan A. G. M.},
+  month = jan,
+  year = {2017},
+  pages = {452-473}
+}
+
+@article{deursen_toward_2015,
+  title = {Toward a {{Multifaceted Model}} of {{Internet Access}} for {{Understanding Digital Divides}}: {{An Empirical Investigation}}},
+  volume = {31},
+  issn = {0197-2243},
+  shorttitle = {Toward a {{Multifaceted Model}} of {{Internet Access}} for {{Understanding Digital Divides}}},
+  number = {5},
+  journal = {The Information Society},
+  doi = {10.1080/01972243.2015.1069770},
+  author = {van Deursen, Alexander J. A. M. and van Dijk, Jan A. G. M.},
+  month = oct,
+  year = {2015},
+  keywords = {digital divide,skills,motivation,Internet access,material access,usage},
+  pages = {379-391}
+}
+
+@article{donner_exploring_2011,
+  title = {Exploring {{Mobile}}-Only {{Internet Use}}: {{Results}} of a {{Training Study}} in {{Urban South Africa}}},
+  volume = {5},
+  copyright = {The  International Journal of Communication  is an academic journal. As such, it is dedicated to the open exchange of information. For this reason, IJoC is freely available to individuals and institutions. Copies of this journal or articles in this journal may be distributed for research or educational purposes free of charge and without permission. However, commercial use of the IJoC website or the articles contained herein is expressly prohibited without the written consent of the editor. Authors who publish in The  International Journal of Communication  will release their articles under the   Creative Commons Attribution Non-Commercial No Derivatives (by-nc-nd) license  . This license allows anyone to copy and distribute the article for non-commercial purposes provided that appropriate attribution is given. For details of the rights authors grants users of their work, see the  "human-readable summary" of the license , with a link to the full license. (Note that "you" refers to a user, not an author, in the summary.) This journal utilizes the  LOCKSS system to create a distributed archiving system among participating libraries and permits those libraries to create permanent archives of the journal for purposes of preservation and restoration. The publisher perpetually authorizes participants in the LOCKSS system to archive and restore our publication through the LOCKSS System for the benefit of all LOCKSS System participants. Specifically participating libraries may:  Collect and preserve currently accessible materials;  Use material consistent with original license terms;  Provide copies to other LOCKSS appliances for purposes of audit and repair.   ~    Fair Use The U.S. 
Copyright Act of 1976 specifies, in Section 107, the terms of the Fair Use exception: Notwithstanding the provisions of sections 106 and 106A, the fair use of a copyrighted work, including such use by reproduction in copies or phonorecords or by any other means specified by that section, for purposes such as criticism, comment, news reporting, teaching (including multiple copies for classroom use), scholarship, or research, is not an infringement of copyright. In determining whether the use made of a work in any particular case is a fair use the factors to be considered shall include:  the purpose and character of the use, including whether such use is of a commercial nature or is for nonprofit educational purposes;  the nature of the copyrighted work;  the amount and substantiality of the portion used in relation to the copyrighted work as a whole; \&  the effect of the use upon the potential market for or value of the copyrighted work.   The fact that a work is unpublished shall not itself bar a finding of fair use if such finding is made upon consideration of all the above factors. In accord with these provisions, the  International Journal of Communication  believes in the vigorous assertion and defense of Fair Use by scholars engaged in academic research, teaching and non-commercial publishing. Thus, we view the inclusion of ``quotations'' from existing print, visual, audio and audio-visual texts to be appropriate examples of Fair Use, as are reproductions of visual images for the purpose of scholarly analysis. We encourage authors to obtain appropriate permissions to use materials originally produced by others, but do not require such permissions as long as the usage of such materials falls within the boundaries of Fair Use.  The  International Journal of Communication  encourages authors to employ fair use in their scholarly publishing wherever appropriate. 
Fair use is the right to use unlicensed copyrighted material (whether it is text, images, audio-visual, or other) in your own work, in some circumstances. We consult the  Code of Best Practices in Fair Use for Scholarly Research in Communication , created by the International Communication Association and endorsed by the National Communication Association, and you should too. If you have any questions about whether fair use applies to your uses of copyrighted material (whether it is text, images, audio-visual, or other) in your scholarship, simply include your rationale, grounded in the Best Practices, as a supplementary document with your submission.},
+  issn = {1932-8036},
+  shorttitle = {Exploring {{Mobile}}-Only {{Internet Use}}},
+  language = {en},
+  number = {0},
+  urldate = {2019-03-27},
+  journal = {International Journal of Communication},
+  url = {https://ijoc.org/index.php/ijoc/article/view/750},
+  author = {Donner, Jonathan and Gitau, Shikoh and Marsden, Gary},
+  month = apr,
+  year = {2011},
+  pages = {24}
+}
+
+@article{hargittai_second-level_2002,
+  title = {Second-{{Level Digital Divide}}: {{Differences}} in {{People}}'s {{Online Skills}}},
+  volume = {7},
+  copyright = {Copyright (c)},
+  issn = {1396-0466},
+  shorttitle = {Second-{{Level Digital Divide}}},
+  language = {en-US},
+  number = {4},
+  journal = {First Monday},
+  doi = {10.5210/fm.v7i4.942},
+  author = {Hargittai, Eszter},
+  month = apr,
+  year = {2002}
+}
+
+@book{stinchcombe_constructing_1987,
+  address = {{Chicago}},
+  title = {Constructing Social Theories},
+  isbn = {978-0-226-77484-8},
+  language = {English},
+  publisher = {{University of Chicago Press}},
+  author = {Stinchcombe, Arthur L},
+  year = {1987},
+  note = {OCLC: 970416061}
+}
+
+@inproceedings{kim_modeling_2014,
+  address = {{New York, NY, USA}},
+  series = {{{WSDM}} '14},
+  title = {Modeling {{Dwell Time}} to {{Predict Click}}-Level {{Satisfaction}}},
+  isbn = {978-1-4503-2351-2},
+  booktitle = {Proceedings of the 7th {{ACM International Conference}} on {{Web Search}} and {{Data Mining}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2556195.2556220},
+  author = {Kim, Youngho and Hassan, Ahmed and White, Ryen W. and Zitouni, Imed},
+  year = {2014},
+  keywords = {user behavior,click satisfaction,dwell time analysis},
+  pages = {193--202}
+}
+
+@inproceedings{jansen_analysis_2003,
+  address = {{Las Vegas, Nevada}},
+  title = {An {{Analysis}} of {{Web Documents Retrieved}} and {{Viewed}}},
+  language = {en},
+  booktitle = {International {{Conference}} on {{Internet Computing}}},
+  publisher = {{CSREA Press}},
+  author = {Jansen, Bernard J and Spink, Amanda},
+  year = {2003},
+  pages = {65-69}
+}
+
+@misc{davies_mediums_2013,
+  title = {Medium's Metric That Matters: {{Total Time Reading}}},
+  shorttitle = {Medium's Metric That Matters},
+  urldate = {2019-03-30},
+  journal = {Data Lab},
+  url = {https://medium.com/data-lab/mediums-metric-that-matters-total-time-reading-86c4970837d5},
+  author = {Davies, Pete},
+  month = nov,
+  year = {2013}
+}
+
+@article{okoli_wikipedia_2014,
+  title = {Wikipedia in the Eyes of Its Beholders: {{A}} Systematic Review of Scholarly Research on {{Wikipedia}} Readers and Readership},
+  volume = {65},
+  copyright = {\textcopyright{} 2014 ASIS\&T},
+  issn = {2330-1643},
+  shorttitle = {Wikipedia in the Eyes of Its Beholders},
+  language = {en},
+  number = {12},
+  journal = {Journal of the Association for Information Science and Technology},
+  doi = {10.1002/asi.23162},
+  author = {Okoli, Chitu and Mehdi, Mohamad and Mesgari, Mostafa and Nielsen, Finn {\AA}rup and Lanam{\"a}ki, Arto},
+  year = {2014},
+  keywords = {Internet,knowledge,reading},
+  pages = {2381-2403}
+}
+
+@inproceedings{priedhorsky_measuring_2017,
+  address = {{New York, NY, USA}},
+  series = {{{CSCW}} '17},
+  title = {Measuring {{Global Disease}} with {{Wikipedia}}: {{Success}}, {{Failure}}, and a {{Research Agenda}}},
+  isbn = {978-1-4503-4335-0},
+  shorttitle = {Measuring {{Global Disease}} with {{Wikipedia}}},
+  booktitle = {Proceedings of the 2017 {{ACM Conference}} on {{Computer Supported Cooperative Work}} and {{Social Computing}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2998181.2998183},
+  author = {Priedhorsky, Reid and Osthus, Dave and Daughton, Ashlynn R. and Moran, Kelly R. and Generous, Nicholas and Fairchild, Geoffrey and Deshpande, Alina and Del Valle, Sara Y.},
+  year = {2017},
+  keywords = {disease,epidemiology,forecasting,modeling,wikipedia},
+  pages = {1812--1834}
+}
+
+@inproceedings{gorbatai_exploring_2011,
+  address = {{New York, NY, USA}},
+  series = {{{WikiSym}} '11},
+  title = {Exploring {{Underproduction}} in {{Wikipedia}}},
+  isbn = {978-1-4503-0909-7},
+  booktitle = {Proceedings of the 7th {{International Symposium}} on {{Wikis}} and {{Open Collaboration}}},
+  publisher = {{ACM}},
+  doi = {10.1145/2038558.2038595},
+  author = {Gorbat{\^a}i, Andreea D.},
+  year = {2011},
+  keywords = {collective production,social goods,underproduction},
+  pages = {205--206}
+}
+
+@article{bell_extensive_2001,
+  title = {Extensive {{Reading}}: {{Speed}} and {{Comprehension}}},
+  volume = {1},
+  issn = {1533-242X},
+  shorttitle = {Extensive {{Reading}}},
+  language = {en},
+  number = {1},
+  journal = {Reading Matrix: An International Online Journal},
+  author = {Bell, Timothy I.},
+  year = {2001},
+  keywords = {Comparative Analysis,Foreign Countries,Reading Comprehension,Reading Instruction,Reading Rate,Reading Tests,Scores,Teaching Methods}
+}
+
+@article{bochkarev_average_2012,
+  archivePrefix = {arXiv},
+  primaryClass = {cs},
+  title = {Average Word Length Dynamics as Indicator of Cultural Changes in Society},
+  urldate = {2019-04-01},
+  journal = {arXiv:1208.6109 [cs]},
+  url = {http://arxiv.org/abs/1208.6109},
+  author = {Bochkarev, Vladimir V. and Shevlyakova, Anna V. and Solovyev, Valery D.},
+  month = aug,
+  year = {2012},
+  keywords = {Computer Science - Computation and Language,91F20,J.5}
+}
+
+@article{scheerder_determinants_2017-1,
+  title = {Determinants of {{Internet}} Skills, Uses and Outcomes. {{A}} Systematic Review of the Second- and Third-Level Digital Divide},
+  volume = {34},
+  issn = {0736-5853},
+  number = {8},
+  journal = {Telematics and Informatics},
+  doi = {10.1016/j.tele.2017.07.007},
+  author = {Scheerder, Anique and {van Deursen}, Alexander and {van Dijk}, Jan},
+  month = dec,
+  year = {2017},
+  keywords = {Internet skills,Digital divide,Internet outcomes,Internet use,Systematic literature review},
+  pages = {1607-1624}
+}
+
+@article{willems_equity_2012,
+  title = {Equity Considerations for Open Educational Resources in the Glocalization of Education},
+  volume = {33},
+  issn = {0158-7919},
+  number = {2},
+  journal = {Distance Education},
+  doi = {10.1080/01587919.2012.692051},
+  author = {Willems, Julie and Bossu, Carina},
+  month = aug,
+  year = {2012},
+  keywords = {equity,education,e-inclusion,open educational resources},
+  pages = {185-199}
+}
+
+@article{reagle_gender_2011,
+  title = {Gender {{Bias}} in {{Wikipedia}} and {{Britannica}}},
+  volume = {5},
+  copyright = {The  International Journal of Communication  is an academic journal. As such, it is dedicated to the open exchange of information. For this reason, IJoC is freely available to individuals and institutions. Copies of this journal or articles in this journal may be distributed for research or educational purposes free of charge and without permission. However, commercial use of the IJoC website or the articles contained herein is expressly prohibited without the written consent of the editor. Authors who publish in The  International Journal of Communication  will release their articles under the   Creative Commons Attribution Non-Commercial No Derivatives (by-nc-nd) license  . This license allows anyone to copy and distribute the article for non-commercial purposes provided that appropriate attribution is given. For details of the rights authors grants users of their work, see the  "human-readable summary" of the license , with a link to the full license. (Note that "you" refers to a user, not an author, in the summary.) This journal utilizes the  LOCKSS system to create a distributed archiving system among participating libraries and permits those libraries to create permanent archives of the journal for purposes of preservation and restoration. The publisher perpetually authorizes participants in the LOCKSS system to archive and restore our publication through the LOCKSS System for the benefit of all LOCKSS System participants. Specifically participating libraries may:  Collect and preserve currently accessible materials;  Use material consistent with original license terms;  Provide copies to other LOCKSS appliances for purposes of audit and repair.   ~    Fair Use The U.S. 
Copyright Act of 1976 specifies, in Section 107, the terms of the Fair Use exception: Notwithstanding the provisions of sections 106 and 106A, the fair use of a copyrighted work, including such use by reproduction in copies or phonorecords or by any other means specified by that section, for purposes such as criticism, comment, news reporting, teaching (including multiple copies for classroom use), scholarship, or research, is not an infringement of copyright. In determining whether the use made of a work in any particular case is a fair use the factors to be considered shall include:  the purpose and character of the use, including whether such use is of a commercial nature or is for nonprofit educational purposes;  the nature of the copyrighted work;  the amount and substantiality of the portion used in relation to the copyrighted work as a whole; \&  the effect of the use upon the potential market for or value of the copyrighted work.   The fact that a work is unpublished shall not itself bar a finding of fair use if such finding is made upon consideration of all the above factors. In accord with these provisions, the  International Journal of Communication  believes in the vigorous assertion and defense of Fair Use by scholars engaged in academic research, teaching and non-commercial publishing. Thus, we view the inclusion of ``quotations'' from existing print, visual, audio and audio-visual texts to be appropriate examples of Fair Use, as are reproductions of visual images for the purpose of scholarly analysis. We encourage authors to obtain appropriate permissions to use materials originally produced by others, but do not require such permissions as long as the usage of such materials falls within the boundaries of Fair Use.  The  International Journal of Communication  encourages authors to employ fair use in their scholarly publishing wherever appropriate. 
Fair use is the right to use unlicensed copyrighted material (whether it is text, images, audio-visual, or other) in your own work, in some circumstances. We consult the  Code of Best Practices in Fair Use for Scholarly Research in Communication , created by the International Communication Association and endorsed by the National Communication Association, and you should too. If you have any questions about whether fair use applies to your uses of copyrighted material (whether it is text, images, audio-visual, or other) in your scholarship, simply include your rationale, grounded in the Best Practices, as a supplementary document with your submission.},
+  issn = {1932-8036},
+  language = {en},
+  number = {0},
+  urldate = {2019-06-24},
+  journal = {International Journal of Communication},
+  url = {https://ijoc.org/index.php/ijoc/article/view/777},
+  author = {Reagle, Joseph and Rhue, Lauren},
+  month = aug,
+  year = {2011},
+  pages = {21}
+}
+
+
--- a/dissertations/nathante_uw_2021/UWPhDThesis_Template_2013_Updated_11.29.2015.docx
+++ b/dissertations/nathante_uw_2021/UWPhDThesis_Template_2013_Updated_11.29.2015.docx
--- a/dissertations/nathante_uw_2021/abstract.pdf
+++ b/dissertations/nathante_uw_2021/abstract.pdf
--- a/dissertations/nathante_uw_2021/appendix_A_articlequality.tex
+++ b/dissertations/nathante_uw_2021/appendix_A_articlequality.tex
@ -0,0 +1,312 @@
+
+% \baselineskip 24ptn
+
+%%
+%% The "title" command has an optional parameter,
+%% allowing the author to define a "short title" to be used in page headers.
+%% Sneha suggests changing the title so that it makes reference to ORES.
+%% Abstract 150 words
+\chapterprecishere{
+% Most explanations of changes in online group size focus on internal factors like social structures or design decisions. 
+% do not make the , and render critical questions like “which other groups are a given group's strongest competitors or mutualists?”  unanswerable.
+Organizing complex peer production projects and advancing scientific knowledge of open collaboration each depend on the ability to measure quality.  Article quality ratings on English language Wikipedia have been widely used by both Wikipedia community members and academic researchers for purposes like tracking knowledge gaps and studying how political polarization shapes collaboration. Even so, measuring quality presents many methodological challenges. The most widely used systems use labels on discrete ordinal scales when assessing quality, but such labels can be inconvenient for statistics and machine learning. Prior work handles this by assuming that different levels of quality are ``evenly spaced'' from one another. This assumption runs counter to intuitions about the relative degrees of effort needed to raise Wikipedia encyclopedia articles to different quality levels. Furthermore, models from prior work are fit to datasets that oversample high-quality articles. This limits their accuracy for representative samples of articles or revisions. I describe a technique extending the Wikimedia Foundation's ORES article quality model to address these limitations. My method uses weighted ordinal regression models to construct one-dimensional continuous measures of quality. While scores from my technique and from prior approaches are correlated, my approach improves accuracy for research datasets and provides evidence that the ``evenly spaced'' assumption is unfounded in practice on English Wikipedia. I conclude with recommendations for using quality scores in future research and include the full code, data, and models.
+}
+
+\section{Introduction} \label{sec:introduction}
+% LATEX NOTE: This alphabet below is here so we can measure the line-length of
+% different layouts.  Typesetters suggest that an average line-length of
+% between 45-90 characters and a rule of thumb for typesetting is that you
+% should be able to fit between 2-3 alphabets on one line.  Generally speaking,
+% the shorter the line length, the better -- and the smaller the linespacing
+% can become.  The following line is 3 alphabets (73 characters). 
+
+% Kaylea suggests adding "support learning" to the motivation in reference to how wikiedu uses the ORES quality measures.
+% This first paragraph is very Wikipedia-centric.
+Measuring content quality in peer production projects like Wikipedia is important so projects can learn about themselves and track progress. Measuring quality also helps build confidence that information is accurate and supports monitoring how well an encyclopedia includes diverse subject areas to identify gaps needing attention \citep{redi_taxonomy_2021}. Measuring quality enables tracking and evaluating the progress of subprojects and initiatives organized to fill the gaps \citep{halfaker_interpolating_2017, warncke-wang_success_2015}.   Raising an article to a high standard of quality is a recognized achievement among contributors, so assessing quality can help motivate contributions \citep{ayers_how_2008,forte_why_2005}. In these ways, measuring quality can be of key importance to advancing the priorities of the Wikimedia movement and is also important to other kinds of open collaboration \citep{champion_underproduction_2021}.
+
+Measuring quality also presents methodological and ontological challenges.  How can ``quality'' be conceptualized so that measurement of the goals of a project and the value it produces can be precise and accurate?
+Language editions of Wikipedia, including English, peer produce quality labels that have been useful both for motivating and coordinating project work and for enabling research.
+Epistemic virtues of this approach stem from the community-constructed criteria for assessment and from formalized procedures for third-party evaluation organized by WikiProjects. These systems also have two important limitations: (1) ratings are likely to lag behind changes in article quality, and (2) quality is assessed on a discrete ordinal scale, which violates typical assumptions in statistical analysis. Both limitations are surmountable.
+
+The machine learning framework introduced by \citet{warncke-wang_tell_2013}, further developed by \citet{halfaker_interpolating_2017}, implemented by the Objective Revision Evaluation Service\footnote{\url{https://www.mediawiki.org/wiki/ORES} (\url{https://perma.cc/TH6L-KFT6})} (ORES) article quality models and adopted by several research studies of Wikipedia article quality \citep[e.g.][]{halfaker_ores_2020, kocielnik_reciprocity_2018, shi_wisdom_2019, warncke-wang_success_2015} was designed to address the first limitation by using article assessments at the time they were made as ``ground truth.'' Article quality might drift in the periods between assessments, but it seems safe to assume that new quality assessments are accurate at the time they are made. A model trained on recent assessments can predict what quality label an article would receive if assessed in its current state.
+
+%In this paper, I build on these models to address the second limitation by developing a one-dimensional measurement of article quality that does not assume that the quality levels are evenly spaced.
+
+This paper introduces a method for constructing interpretable one-dimensional measures of article quality from Wikipedia quality assessments and the ORES article quality model. The method improves upon prior approaches in two important ways. First, by using inverse probability weighting to calibrate the model, it is more accurate for typical research applications, and second, it does not depend on the assumption that quality levels are ``evenly spaced,'' which threatens the validity of prior research \citep{halfaker_interpolating_2017, arazy_evolutionary_2019}.  In addition, this paper helps us understand the validity of previous work by analyzing the performance of the ORES quality model and testing the ``evenly spaced'' assumption.
+
+In §\ref{sec:background}, I provide a brief overview of quality measurement in peer production research, in which I foreground the importance of the assumptions needed to use machine learning predictions in downstream analysis---particularly the ``evenly spaced'' assumption used by  \citet{halfaker_interpolating_2017} to justify the use of a handpicked weighted sum to combine article class probabilities.  Next, in §\ref{sec:methods}, I describe how to build accurate ordinal quality models that are appropriately calibrated for analyses of representative samples of Wikipedia articles or revisions. I also briefly explain how ordinal regression provides an interpretable one-dimensional measure of quality and how it relaxes the ``evenly spaced'' assumption.  Finally, in §\ref{sec:results} I present the results of my analysis to (1) show how the precision of the measurement depends on proper calibration and (2) demonstrate that the ``evenly spaced'' assumption is violated. Despite this, I find that scores from the ordinal models are highly correlated with those from prior work so the ``evenly spaced'' assumption may be acceptable in some applications. I conclude in §\ref{sec:discussion} with recommendations for measuring article quality in future research.
+
+\section{Background}
+\label{sec:background}
+
+
+% first point: measuring quality can help peer production projects
+% second point: measuring quality can help science
+
+% Mako thinks this is cute and it's fine to keep it but the bit about freezing mercury in the discussion takes it a bit far.
+Measurement is important to science as available knowledge often constrains the development of improved tools for advancing knowledge.  For example, in the book \textit{Inventing Temperature}, Hasok \citeauthor{chang_inventing_2004} \citep{chang_inventing_2004}, the philosopher and historian of science,  documents how extending theories of heat beyond the range of human sense perception required scientists to develop new types of thermometers. This in turn required better knowledge of heat and of the properties of thermometric materials, such as the freezing point of mercury.   Part of the challenge of scientific advancement is that measurement devices developed under certain conditions may give unexpected results outside of the range in which they are calibrated: a thermometer will give impossibly low temperature readings when its mercury unexpectedly freezes. Today, machine learning models are used to extend the range of quality measurements in peer production research, but state-of-the-art machine learning models can be quite sensitive to the nuances of how their training data are selected \citep{recht_imagenet_2019}. 
+% This project introduces a new measurement device for measuring article quality and provides assurance that the measurement is reasonably accurate over the range of a given dataset.  
+
+\subsection{Measuring Quality in Peer Production}
+
+
+As described in §\ref{sec:introduction}, measuring quality has been of great importance to peer production projects like Wikipedia and in the construction of knowledge about how such projects work. The foundation of article quality measurement in Wikipedia has been the peer production of article quality assessment organized by WikiProjects who develop criteria for articles in their domain \citep{phoebe_ayers_how_2008}. This enables quality assessment to be consistent across different subject areas, but the procedures for assessing quality are tailored to the values of each WikiProject.  Yet, like human sense perception of temperature, these quality assessments are limited in that they require human time and attention. In addition, humans' limited ability to discriminate between levels on a scale limits the sensitivity of quality assessments. Articles are assessed irregularly and infrequently at the discretion of volunteer editors. Therefore, for most article revisions, it is not known what quality class the article would be assigned if it were newly assessed.
+
+% This paragraph is a bit lit reviewy and nonessential to the argument. Cut or rework. 
+Researchers have proposed many ideas to extend the range of quality measurement beyond the direct perception of Wikipedians, such as page length \citep{blumenstock_size_2008}, persistent word revisions \citep{adler_content-driven_2007, biancani_measuring_2014}, collaboration network structures \citep{raman_classifying_2020}, and template-based flaw detection \citep{anderka_predicting_2012}. Carefully constructed indexes benchmarked against English language Wikipedia quality assessments might allow quality measurement of articles that have not been assessed or in projects that have underproduced article assessments \citep{lewoniewski_relative_2017}. However, such indexes may lack emic validity if they fail to capture important aspects of quality or if notions of quality vary between linguistic communities and might even shape the editing activity in unexpected ways that could ultimately defeat their purpose \citep{goodhart_problems_1984,strathern_improving_1997}. Peer-produced quality labels depend on the limited capacity of volunteer communities to coordinate quality assessment, but also provide impressive validity for evaluating projects on their own terms.  
+
+\subsection{Article Quality Models Extend Measurement to Unassessed Articles} 
+
+Perhaps the most successful approaches to extending the range of quality measurements use machine learning models trained on available article quality assessments to predict the quality of revisions that have not been assessed.  The ORES article quality model (henceforth ORES) implements this approach, but other similar article quality predictors have been developed \citep{anderka_breakdown_2012,dang_quality_2016,zhang_history-based_2018,druck_learning_2008,sarkar_stre_2019,raman_classifying_2020}, and additional features including those based on language models can substantially improve classification performance compared to ORES \citep{schmidt_article_2019}. The ORES model is a tree-based classifier that predicts the quality class of a Wikipedia article at the time it is assessed.\footnote{The system uses cross-validation to select among candidates that include random-forest and boosted decision tree models.} These tree-based models are reasonable for practical purposes with the reported ability to predict within one level of the true quality class with 90\% accuracy (although in §\ref{sec:accuracy} I find a decline in accuracy in a more recent dataset). Yet, since these models do not account for the ordering of quality labels, the use of these predictions in downstream analysis introduces complicated methodological challenges. 
+
+
+The ORES classifiers are fit using \texttt{scikit-learn}\footnote{\url{https://scikit-learn.org/stable/} (\url{https://perma.cc/5Y8B-W8T5})} through minimization of the multinomial deviance as shown \citep{pedregosa_scikit-learn_2011,hastie_elements_2018}:
+% = -\sum_{k=1}^{K}I(y=\mathcal{G}_k)f_k(x) + log(sum_{l=1}^K(e^{f_l(x)}))
+\begin{equation}
+  L(y_i,p(x_i)) = -\sum_{k=1}^K{I(y_i=\mathcal{G}_{i,k})\log p_k(x_i)}
+\label{eq:multinomial.loglik}
+\end{equation}
+
+\noindent For each article $i$ with predictors $x_i$ that has been labeled with a quality class $y_i$, the ORES model outputs an estimated probability $p_k(x_i)$ that the article belongs to each quality class $k \in \{\mathrm{\textit{stub}}, \mathrm{\textit{start}}, \mathrm{\textit{C-class}}, \mathrm{\textit{B-class}}, \mathrm{\textit{Good article (GA)}}, \mathrm{\textit{Featured article (FA)}}\}$. The predicted probabilities $p(x_i)$ sum to one so the ORES model outputs a probability vector for each article.  If  $\mathcal{G}_{i,k}$, the most probable quality class (MPQC) according to the model, is the true label, then $I(y_i=\mathcal{G}_{i,k})$ equals $1$ ($I$ is the indicator function) and the log predicted probability $p_k(x_i)$ of the correct class is subtracted from the loss $L(y_i,p(x_i))$. Note that this  model does not use the fact that article quality classes are ordered.  If it did, then it would have to penalize an incorrect classification of a \textit{Good article} as \textit{C-class} more than a classification of a \textit{Good article} as \textit{B-class}. In this model, different quality classes have no intrinsic rank or ordering and thus are akin to different categories of article subjects like animals, vegetables, or minerals.
+
+The MPQC is perhaps the most natural way to use the ORES output to measure quality. It has been used in several studies including to provide evidence that politically polarized collaboration on Wikipedia leads to high quality articles \citep{shi_wisdom_2019} and to understand the relationship between article quality and donation \citep{kocielnik_reciprocity_2018}.  However, the MPQC is limited in that it does not measure quality differences between articles that have the same MPQC. Consider two hypothetical articles; the first has the multinomial prediction $(0.1,0.3,0.4,0.075,0.075,0.05)$ and the second has the prediction $(0.075,0.075,0.4,0.3,0.1,0.05)$. The MPQC will assign both the \textit{C-class} label even though the first article has an even chance at being a \textit{Stub} or \textit{Start-class} while the second article has an even chance at being a \textit{B-class} or even a \textit{Good article}.  At best, the MPQC has limited sensitivity to subtle variations or gradual changes in quality \citep{halfaker_interpolating_2017}.  
+
+\subsection{Combining Scores for Granular Measurement}
+
+To further extend the range of article quality measurement within article quality classes, \citet{halfaker_interpolating_2017} constructed a numerical quality score using a linear combination (a weighted sum) of the elements of the multinomial prediction $p(x_i)$. This is advantageous from a statistical perspective as it naturally provides a continuous measure of quality which can typically justify a normal or log-normal statistical model. It can also support higher-order aggregations for measuring the quality of a set of articles \citep{halfaker_interpolating_2017}.  \citeauthor{halfaker_interpolating_2017} handpicks the coefficients $[0,1,2,3,4,5]$ to make a linear combination of the predictions under the assumption ``that the ordinal quality scale developed by Wikipedia editors is roughly cardinal and evenly spaced,'' which I refer to as the ``evenly spaced'' assumption. It essentially says that a \textit{Start-class} article has one more unit of quality than a \textit{Stub-class} article, and that a \textit{C-class} article has one more unit of quality than a \textit{Start-class} article and so on. This approach is being adopted by other researchers including  \citet{arazy_evolutionary_2019}.
+
+The considerable degree of effort and expertise required to raise articles to higher levels of quality casts doubt on this assumption \citep{jemielniak_common_2014}.  Higher quality levels correspond to increasing completeness, encyclopedic character, usefulness to wider audiences, incorporation of multimedia, polished citations, and adherence to Wikipedia's policies. The English language Wikipedia editing guideline on content assessment\footnote{\url{https://en.wikipedia.org/w/index.php?title=Wikipedia:Content_assessment&oldid=1023695750} (\url{https://perma.cc/2JUV-6SD})} defines a \textit{Good article} as ``useful to nearly all readers, with no obvious problems'' and a \textit{Featured article} as ``professional, outstanding and thorough.'' According to Wikipedians, it can take ``three to six months of full time work'' to write a \emph{Featured article}.\footnote{Public statement by Stuart Yeates, an expert Wikipedian; quoted with permission. \url{https://lists.wikimedia.org/hyperkitty/list/wiki-research-l@lists.wikimedia.org/message/7U35LHAXRWEPABN75DOTPOIEA2VYCTQQ/} (\url{https://perma.cc/9V4P-WRXR})}  Are we to assume that the difference in quality between a \textit{Good article} and a \textit{Featured article} is measurably the same as that between a \textit{Stub} defined as ``little more than a dictionary definition'' and a \textit{Start-class} that is ``a very basic description of the topic?'' How could we even answer this question? 
+%This paper provides a methodology to answer it, but the answer depends on how quality is measured.
+
+If the ``evenly spaced'' assumption is reasonable, then \citeauthor{halfaker_interpolating_2017}'s weighted sum approach is too. But if increasing Wikipedia article classes do not represent roughly equal improvements in quality, this may threaten the accuracy of analysis dependent on the assumption. If a \textit{B-class} article has not 1, but 2 more units of quality than a \textit{C-class} article, then \citeauthor{halfaker_interpolating_2017} could have underestimated the improvement in the knowledge gap of women scientists, which was considerably driven by improvement in \textit{B-class} articles.  In the next section, I provide a straightforward extension of the ORES article quality model based on ordinal regression that can both relax the ``evenly spaced'' assumption and provide a better calibrated and more accurate one-dimensional measure of quality.
+
+%I now describe my implementation of the approach. I will then evaluate my model in terms of predictive accuracy, the spacing of quality levels, and comparison with prior approaches.
+
+
+\section{Data, Methods and Measures}
+\label{sec:methods}
+
+%\citeauthor{halfaker_interpolating_2017} \cite{halfaker_interpolating_2017} constructed a one-dimensional measure of article quality using handpicked linear combination of the ORES category predictions assuming that quality levels are evenly spaced. I choose the linear combination 
+
+I use Bayesian ordinal regression models that use the ORES predicted probabilities to predict the quality class labels and quantify the distance between quality classes. I now provide a brief overview of ordinal regression as needed to explain my approach to measuring quality. Understanding ordinal regression depends on background knowledge of odds and generalized linear models.  I recommend \citet{mcelreath_statistical_2018} for reference.
+
+\subsection{Bayesian Ordinal Regression}
+
+Ordinal regression predicts quality class membership using a single linear model for all classes and identifies boundaries between classes using the log cumulative odds link function shown below in Eq. \ref{eq:ordinal.regression}.  The log cumulative odds is not the only possible choice of link function, but it is the most common, is the easiest to interpret, and is appropriate here.
+
+\begin{align}
+  \mathrm{log}&~\frac{\mathrm{Pr}(y_i \le k)}{1 - \mathrm{Pr}(y_i \le k)} = \alpha_k - \phi_i  \label{eq:ordinal.regression} \\
+  \phi_i &= B x_i \nonumber 
+\end{align}
+\noindent As in Eq.~\ref{eq:multinomial.loglik}, $y_i$ is the quality label for article $i$. The left hand side of Eq.~\ref{eq:ordinal.regression} gives the log odds that $y_i$ is less than or equal to quality level $k$. The ordinal quality measure is given by a linear model $\phi_i = B x_i$ ($x_i$ is a vector of transformed ORES scores for article $i$).  Key to interpreting $\phi_i$ as a quality measure are the intercept parameters $\alpha_k$ for each quality level $k$. The log cumulative odds (the log odds that article $i$ has quality less than or equal to $k$) are given by the difference between the intercept and the linear model, $\alpha_k - \phi_i$. Therefore, if $\phi_i = \alpha_k$ then the chances that $y_i \le k$ equal the chances that $y_i > k$. When $\phi_i$ is less than $\alpha_k$, the quality of article $i$ is probably less than or equal to quality level $k$.  As $\phi_i - \alpha_k$ increases so do the chances that article $i$ is of quality better than $k$. In this way, the threshold parameters $\alpha_k$ define quantitative article quality levels on the scale of the ordinal quality measure $\phi_i$.
+
+Informally, an ordinal regression model maps a linear regression model to the ordinal scale using the log cumulative odds link function. It does this by inferring thresholds that partition the range of linear predictions. When the linear predictor for an article crosses a threshold, the probability that the article has quality greater than that corresponding to the threshold begins to increase.
+
+Bayesian inference allows interpreting model parameters like $\phi_i$ and $\alpha_k$ as random variables and provides accurate quantification of uncertainty in thresholds and predictions. I fit models using the R package Bayesian regression modeling using Stan (\texttt{brms}) \citep{burkner_brms_2017} version 2.15.0. I use the default priors for ordinal regression, which are weakly informative.  Due to the large sample size, the data overwhelm the priors and the priors have little influence over results.  I confirmed this by fitting equivalent frequentist models using the \texttt{polr} function in the \texttt{MASS} R package \citep{venables_modern_2002} and found that the estimates of intercepts and coefficients were very close. 
+
+% "all useful information" not strictly true
+The six quality scores output by the ORES article quality classifier are perfectly collinear by construction because they sum to one. This means they cannot all be included in the same regression model.  Since interpreting the coefficients is not important, I take a linear transformation of the ORES scores using appropriately weighted principal component analysis and use the first five principal components as the independent variables. This is simpler and more statistically efficient than a model selection procedure.
+
+%I fit 3 ordinal regression models, one for each of the units of analysis using weights as described below in §\ref{sec:data}. The use of different weights is important to ensure that the model, and therefore the resulting quality scale is well calibrated to the chosen unit of analysis as shown in Figure \ref{fig:calibration}. To further demonstrate the importance of calibrating the models to the correct unit of analysis, I report the accuracy of each model (and of the MPQC) on each weighted dataset in §\ref{sec:accuracy}.
+
+\subsection{Dataset and Model Calibration}
+\label{sec:data}
+
+I draw a new random sample of 5,000 articles from each quality class to develop my models. I first reuse code from the \texttt{articlequality}\footnote{\url{https://pypi.org/project/articlequality} (\url{https://perma.cc/8R4H-MAZ9})} Python package to process the March 2020 XML dumps for English Wikipedia and extract up-to-date article quality labels.  I then select pages that have been assessed by a member of at least one WikiProject. Following prior work, if an article is assessed at different levels according to more than one WikiProject, I assign it to the highest such level and I drop articles having the rarely used \emph{A-class} quality level \citep{halfaker_interpolating_2017,warncke-wang_success_2015,warncke-wang_tell_2013}.  Next, I use the \texttt{revscoring}\footnote{\url{https://pypi.org/project/revscoring} (\url{https://perma.cc/3HFN-V23Z})} Python package to obtain the ORES scores of the labeled article versions. Some of these versions have been deleted leading to missing observations at each quality level. Table \ref{tab:sample} shows the number of articles sampled in each quality class. I reserve a random sample of 2000 articles which I use in reporting my results and fit my ordinal regression models on the remainder.   
+
+%For a fair comparison of predictive accuracy, I holdout a random sample of r2[['n.holdout']] articles.  
+%From these labeled articles I draw a new stratified sample to enable the use of a smaller sample that is ``balanced,'' meaning that it has equal sample sizes for all article classes as shown in Table \ref{tab:sample}. 
+The ORES article quality classifiers are fit on a ``balanced'' dataset having an equal number of articles in each quality class.  Thus, an ORES score is the probability that an article is a member of a quality class under the assumption that the article was drawn from a population where each quality class contains an equal number of articles.  Simply put, the model has learned from its training data that each quality class is about the same size. 
+
+\begin{figure}
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\maxwidth]{figures/calibration-1} 
+\end{knitrout}
+\caption{Calibration of each predictive quality model on datasets representative of each unit of analysis (article, revision, quality class). Each chart shows, for each quality class, the miscalibration of a model (columns) with respect to a dataset weighted to represent a unit of analysis (rows). The y-axis shows the difference between the true probability of the quality class and the average predicted probability of that class, given a chosen unit of analysis. Points close to zero indicate good calibration. For example, the top-left chart shows that the article model is well-calibrated to the dataset on which it was fit and the middle-left chart shows that the article model predicts that articles are \textit{Stubs} with probability greater than the frequency of \textit{Stubs} in a random sample of revisions. Error bars show 95\% confidence intervals. \label{fig:calibration}}
+\end{figure}
+
+
+This is not representative of the overall article quality on Wikipedia, which is highly skewed with over 3 million \textit{Stubs} but only around 7,000 \textit{Featured articles} as shown in Table \ref{tab:sample}. Although using a balanced dataset likely improves the accuracy of the ORES models, for the ordinal regression models, the choice of unit of analysis presents a trade-off between accuracy in a representative sample of articles or revisions and accuracy within each quality class. 
+Constructing a balanced dataset by oversampling is a common practice in machine learning because it can improve predictive performance. However, oversampling can also lead to badly calibrated predictive probabilities as shown in Fig. \ref{fig:calibration}.  Calibration means that, on average, the predicted probability of a quality class equals the average true probability of that class for the unit of analysis. 
+
+The ``balanced'' dataset on which ORES is trained has the \textit{quality class} unit of analysis because each quality class has equal representation. However, researchers are more interested in analyzing representative samples of \textit{articles} or \textit{revisions}. For example, the article unit of analysis would be used to estimate the average quality of a random sample of articles and the revision unit of analysis might be used to model the change in the quality of an encyclopedia over time. 
+Weighting allows the use of the balanced dataset to estimate a model as if the dataset were a uniform random sample of a different unit of analysis.
+My method uses a balanced dataset to fit ordinal regression models with inverse probability weighting to calibrate each model to the unit of analysis of a research project.
+For example, each article in the model calibrated to the article unit of analysis is weighted by the probability of its quality class in the population of articles divided by the probability of its quality class in the sample. The size of the sample and the weights for the article and revision levels of analysis are also shown in Table \ref{tab:sample}.
+% It turns out that the ``evenly spaced'' assumption is sensitive to the unit of analysis.  
+
+
+\begin{table}
+\caption{Number of articles sampled at each quality level}
+% latex table generated in R 4.0.4 by xtable 1.8-4 package
+% 
+\begin{tabular}{lrrrrr}
+  \hline
+Label & No. of articles & No. of revisions & Sample size & Article weights & Revision weights \\ 
+  \hline
+Stub & 3,359,351 & 12,005,611 & 4,969 & 4.23 & 2.52 \\ 
+  Start & 1,019,038 & 7,828,335 & 4,979 & 1.28 & 1.64 \\ 
+  C & 235,655 & 3,889,639 & 4,988 & 0.30 & 0.81 \\ 
+  B & 128,875 & 3,640,591 & 4,990 & 0.16 & 0.76 \\ 
+  GA & 31,808 & 924,468 & 4,999 & 0.04 & 0.19 \\ 
+  FA & 7,438 & 365,255 & 4,995 & 0.01 & 0.08 \\ 
+   \hline
+\end{tabular}
+
+\label{tab:sample}
+\end{table}
+
+% This requires dropping one of the scores, but it is not obvious which one should be dropped.  For both the weighted and unweighted models, I fit six models each dropping a different scores and then use approximate leave-one-out cross validation (LOO-CV) implemented in the \textsc{loo} R package to choose \cite{vehtari_practical_2017}. LOO-CV takes advantage of the Bayesian model to accurately and reliable calculate the expected log out-of-sample pointwise predictive accuracy (ELPD) using Pareto smoothed importance sampling. The choice does not matter much as the standard errors of the ELPD differences are not much smaller than the differences themselves. As shown in Table \ref{tab:loo.comparison}, the best models according to the ELPD have the \textit{start-class} score removed for models with weights and with the \textit{stub-class} score removed for the unweighted models. I therefore use these models from here on. 
+
+\section{Results}
+\label{sec:results}
+I first report my findings about the spacing of the quality classes in each of the models in §\ref{sec:spacing}.   Quality classes are not evenly spaced, especially when articles or revisions are the unit of analysis.  Next, in §\ref{sec:accuracy}, I report the accuracy of each of the models and the uncertainty of the ordinal quality scale.  All models perform similarly to or better than the MPQC within the pertinent unit of analysis. The unweighted model provides the best accuracy and lowest uncertainty across the entire range of quality levels, but is poorly calibrated for other units of analysis. Finally, in §\ref{sec:correlation}, I show that all quality measures are highly correlated, but the ordinal quality measures agree with one another more than with the ``evenly spaced'' measure.
+
+
+
+\begin{figure}
+\centering
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\maxwidth]{figures/fig_spacing-1} 
+\end{knitrout}
+\caption{Quality scores and predictions of the ordinal regression models. Columns in the grid of charts correspond to the ordinal quality model calibrated to the indicated unit of analysis and rows correspond to sampled articles having the indicated level of quality as assessed by Wikipedians. Each chart shows the histogram of scores, thresholds inferred by the ordinal model with 95\% credible intervals colored in gray, and colors indicating when the model makes correct or incorrect predictions.  The thresholds are not evenly spaced, especially in the \textit{revision model} and \textit{article model}, which place more weight on lower quality classes. These two models infer that the gaps between \textit{Stub} and \textit{Start} and between \textit{Start} and \textit{C-class} articles are considerably wider than the gap between \textit{C-class} and \textit{B-class} articles. \label{fig:spacing}}
+\end{figure}
+
+
+\subsection{Spacing of Quality Classes}
+\label{sec:spacing}
+
+The grid of charts in Fig. \ref{fig:spacing} shows quality scores and thresholds for each model (columns) and article quality level (rows). Each chart shows the histogram of quality scores $\phi_i$ given to articles having the true quality label corresponding to the row of the grid. The histograms are colored to indicate regions where the model correctly predicts that articles belong to their true class. Vertical dashed lines show the thresholds inferred by the model with 95\% credible intervals colored in gray. Different models have different ranges of scores, so Fig. \ref{fig:spacing} shows results normalized between 0 and 1.
+
+
+
+
+No matter the unit of analysis, article quality classes are not evenly spaced.  The quality class model provides a quality scale in which  \textit{Featured} articles take up $27\%$ of the scale and are expected to score in the range of $[0.73, 1]$, but probable \textit{C-class} articles only span $14\%$ of the scale in the range $[0.31, 0.45]$.  Researchers are likely to be interested in models calibrated to the article or revision units of analysis, and in these cases, the quality classes are far from evenly spaced. The \textit{revision model} assigns $28\%$ of the scale to \textit{Stubs}, from $0$ to $0.28$. It assigns \textit{C-class} articles the smallest part of the scale, only $4\%$ of it, from $0.54$ to $0.58$.  The \textit{article model} is even more extreme. It assigns \textit{Stubs} to the interval $[0, 0.39]$, $39\%$ of the scale, and the space between thresholds defining the range of \textit{C-class} articles is so narrow that it virtually never predicts that an article will be C-class.  In general terms, the \textit{quality class model} gives relatively equal amounts of space to each quality class compared to the other models, while reserving nearly the top half of the scale for the top 2 quality classes.  The \textit{revision model} and \textit{article model} do the opposite and use the bottom half of the scale to account for differences within the bottom two quality classes, leave some room for \textit{B-class} articles, but squeeze the top end of the scale and \textit{C-class} articles into relatively small intervals. 
+
+
+%spacing between the levels is relatively even compared to the other units of analysis. A greater range of the ordinal quality scale is given to \textit{Featured} articles than to \textit{Good} articles, and a smaller range is given to \textit{C-class} and \textit{B-class} articles. Things are quite different in circumstances more likely to be of interest to researchers: when the units of analysis are revisions or articles. In both cases a large range of the scale is taken by \textit{Stub} and \textit{Start-class} articles at bottom of the scale;  \textit{C-class} articles have a quite small range of the scale, perhaps due to the difficulty in distinguishing them from \textit{B} or \textit{Start-class} articles; and \textit{Good} and \textit{Featured} articles are given some part of the scale, but substantially less than when the unit of analysis is the quality class. 
+
+
+\subsection{Accuracy and Uncertainty}
+\label{sec:accuracy}
+
+I evaluate predictive performance in terms of \textit{accuracy}, the proportion of predictions of article quality that are correct.  To allow comparison with the reported accuracy of the ORES quality models, I also report \textit{off-by-one accuracy}, which includes predictions within one level of the true quality class among correct predictions.
+
+\begin{table}
+\caption{Accuracy of quality prediction models depends on the unit of analysis. The greatest accuracy and off-by-one accuracy scores are highlighted.  Models are more accurate when calibrated on the same unit of analysis on which they are evaluated.  Compared to the MPQC, the ordinal quality models have better accuracy when revisions or articles are the unit of analysis.  When the quality class is the unit of analysis, the ordinal quality model has worse accuracy, but predicts within one quality class with slightly better accuracy. \label{tab:accuracy}}  
+% latex table generated in R 4.0.4 by xtable 1.8-4 package
+% 
+\begin{tabular}{lllll}
+  \hline
+Unit of analysis & Model & Ordinal model? & Accuracy & Off-by-one accuracy \\ 
+  \hline
+Quality class & Article & Yes & 0.33 & 0.75 \\ 
+  Quality class & Revision & Yes & 0.44 & 0.84 \\ 
+  Quality class & Quality class & Yes & 0.52 & \cellcolor{mygreen}0.87 \\ 
+  Quality class & ORES MPQC & No & \cellcolor{mygreen}0.55 & 0.86 \\ 
+   \hline
+Revision & Article & Yes & 0.57 & 0.87 \\ 
+  Revision & Revision & Yes & \cellcolor{mygreen}0.61 & \cellcolor{mygreen}0.92 \\ 
+  Revision & Quality class & Yes & 0.54 & 0.88 \\ 
+  Revision & ORES MPQC & No & 0.58 & 0.9 \\ 
+   \hline
+Article & Article & Yes & \cellcolor{mygreen}0.76 & \cellcolor{mygreen}0.97 \\ 
+  Article & Revision & Yes & 0.73 & 0.96 \\ 
+  Article & Quality class & Yes & 0.63 & 0.92 \\ 
+  Article & ORES MPQC & No & 0.65 & 0.94 \\ 
+   \hline
+\end{tabular}
+
+ \end{table}
+
+As shown in Table \ref{tab:accuracy}, the ordinal regression models have better predictive ability than the MPQC except when the unit of analysis is the quality class.  In this case, the best ordinal quality model has worse accuracy than the MPQC but slightly better off-by-one accuracy. Table \ref{tab:accuracy} shows accuracy and off-by-one accuracy weighted for each unit of analysis. Accuracy for a given unit of analysis depends on having a model fit to data representative of that unit of analysis. Accuracy scores are higher when greater weight is placed on lower article quality classes, suggesting that it is easier to discriminate between these classes.
+
+The ORES article quality model has been quickly adopted by researchers, but its accuracy is limited.  While off-by-one accuracy is above 90\% when the article is the unit of analysis, the MPQC only predicts the correct quality class 55\% of the time when the quality class is the unit of analysis.     
+
+The trade-offs in selecting a unit of analysis on which to calibrate the models are further illustrated by Fig. \ref{fig:uncertainty}, which plots the size of the 95\% credible intervals as a function of the quality scores for each model. As in Fig. \ref{fig:spacing}, quality scores in this plot are rescaled between 0 and 1. The models calibrated to articles or revisions have more certainty in the lower range of the quality scale compared to the model that places equal weight in all quality classes.  This comes with a trade-off for the higher range of quality.  While the \textit{quality class model} has relatively low uncertainty across the entire range of quality, the \textit{revision model} and \textit{article model} have greater uncertainty at higher levels of quality.   
+
+\begin{figure}
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\maxwidth]{figures/uncertainty-1} 
+\end{knitrout}
+\caption{Uncertainty in ordinal quality scores for models calibrated at each unit of analysis.  Points show the size of the 95\% credible interval for the ordinal quality score for each article in the dataset. The quality class model has low uncertainty across the range of quality. Models calibrated to the revision and article levels of analysis have less uncertainty at the low end of the quality scale, but greater uncertainty at the higher end of the scale. \label{fig:uncertainty}}
+\end{figure}
+
+\subsection{Correlation Between Scores}
+\label{sec:correlation}
+
+Although the models have different predictive performances and uncertainties, as measures of quality, they are nearly perfectly correlated with one another as shown in Fig. \ref{fig:correlation}. For each quality score, including the ``evenly spaced'' weighted sum, Fig. \ref{fig:correlation} shows a scatter plot and two correlation statistics: Kendall's $\tau$ and Pearson's $r$.  Pearson's $r$ is the standard linear correlation coefficient and Kendall's $\tau$ is a nonparametric rank-based correlation defined as the probability that the quality scores will agree about which of any two articles has higher quality minus the probability that they will disagree.
+
+According to Pearson's $r$, all the quality scores are highly correlated with correlation coefficients of about $0.98$ or higher. Kendall's $\tau$ measures nonlinear correlation and reveals discrepancies between the ordinal models and the ``evenly spaced'' measures. The Pearson correlation between scores from the \textit{revision model} and the scores from the \textit{quality class model} is about the same as the correlation between the \textit{revision model} scores and the  ``evenly spaced'' scores ($r=0.98$). However, according to Kendall's $\tau$, scores from the \textit{revision model} are more similar to those from the \textit{quality class model}  ($\tau=0.98$) than to the scores from the ``evenly spaced'' approach ($\tau=0.9$).
+
+The evenly spaced model is more likely to disagree with the model-based scores than any of the model-based scores are to disagree with one another as visualized in the scatter plots in Fig. \ref{fig:correlation}. Disagreement between the ``evenly spaced'' method and the ordinal models is greatest among articles in the middle of the quality range. 
+
+\begin{figure}
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\maxwidth]{figures/score_correlation-1} 
+\end{knitrout}
+  \caption{Correlations between quality measures show that the different approaches to measuring quality are quite similar.  ``Evenly spaced'' uses a weighted sum of the ORES scores with handpicked coefficients \citep{halfaker_interpolating_2017}. Lower values of Kendall's $\tau$, a nonparametric rank correlation statistic, compared to Pearson's $r$ suggest nonlinear differences between the weighted sum and the other measures. \label{fig:correlation}}
+\end{figure}
+
+
+\section{Discussion}
+\label{sec:discussion}
+Past efforts to extend the measurement of Wikipedia article quality from peer-produced article quality assessments to unassessed versions of articles and from the discrete to the continuous domain have relied upon machine learning and expedient but untested assumptions, such as the assumption that quality levels are ``evenly spaced.''  
+% I argued in §\ref{sec:background} that using machine learning to extend the article quality measurement from the direct observation of human article assessment to unobserved articles and from the discrete levels to a continuous scale might be analogous to how thermometry extended into new extremes of hot and cold where assumptions like the liquidity of mercury break down. Scientists, unaware that mercury has a solid state were baffled and misled by impossibly low temperature readings from thermometers in which the mercury had unexpectedly frozen \cite{chang_inventing_2004}. 
+While I suggest technical improvements for statistical models for measuring quality, I also find that scores from my models are highly correlated to those obtained under the ``evenly spaced'' assumption.
+
+I set out to provide a better way to convert the probability vector output by the ORES article quality model into a continuous scale and to test the assumption that the quality levels are evenly spaced.  I used ordinal regression models to infer spacing between quality levels and used the linear predictor of these models as a continuous measure of quality.  While I found in §\ref{sec:spacing} that the quality levels are not evenly spaced and that the spacing depends on the unit of analysis to which the models are calibrated, I also showed in §\ref{sec:correlation} that the model-based quality measures are highly, although not perfectly, correlated with  the ``evenly spaced'' measure.  This provides some assurance that past results built on this measure are unlikely to mislead. That said, I recommend that future work adopt appropriately calibrated model-based quality measures instead of the ``evenly spaced'' approach, and I argue that it is important to improve the accuracy of article quality predictors to enable more precise article quality measurement.
+
+\subsection{Recommendations for Measuring Article Quality}
+How should future researchers approach the question of how to measure Wikipedia article quality?  While I cannot provide a final or complete answer to the question, I believe the exercise reported in this paper provides some insights on which to base recommendations. It is important to note that I consider here only approaches to measuring quality that assume the use of a good predictor of article quality assessment, such as the ORES quality model.  I do not consider other approaches such as those based on indexes  \citep{lewoniewski_relative_2017} described in §\ref{sec:background}. 
+
+\subsubsection{Use the principal components of ORES scores for statistical control of article quality}
+In many statistical analyses, the only purpose of measuring quality will be as a statistical control or adjustment. For example, \citet{zhang_crowd_2017} used the MPQC as a control variable in a propensity score matching analysis of promotion to \textit{Featured article} status, but as argued in §\ref{sec:methods}, the MPQC provides less information than the vector of ORES scores. Using the principal components is simpler than using an ordinal quality model. I recommend obtaining ORES scores for your dataset, taking the principal components, and dropping the least significant one to remove collinearity. 
+
+\subsubsection{Use ordinal quality scores when article quality is an independent variable}
+\label{sec:qciv}
+In other cases, research questions will ask how article quality is related to an outcome of interest, like how \citet{kocielnik_reciprocity_2018} set out to explore factors associated with donations to the Wikimedia Foundation. They use the MPQC as an independent variable, which complicates their analysis.  Although they conclude that ``pages with higher quality attract more donations,'' this is not strictly true. They actually found a nonlinear relationship where readers of \textit{B-class} articles were more likely to donate than readers of \textit{Featured articles}. Using a continuous measure of quality is more convenient when the average linear relationship is the target of inference.  
+
+I recommend using an ordinal regression model appropriate to the downstream unit of analysis because this will justify the interpretation of the measure.  If the downstream unit of analysis differs substantively from those used here, such as if different selection criteria are applied, I recommend reusing my code to calibrate a new ordinal regression model to a new dataset.  Otherwise, reusing one of my models should be adequate. Finally, in the Bayesian framework, the scores are interpretable as random variables.  This provides a justification for incorporating the variance of these scores as measurement errors to improve estimation in downstream analysis \citep{mcelreath_statistical_2018}. 
+
+% Although the ``evenly spaced'' scores and the scores based on ordinal regression are highly correlated, there are a number of reasons to prefer my approach. 
+% The most important is simply that it requires no strong assumptions about the relationships between levels of article quality. Rather, it learns both the spacing between quality levels and the best combination of ORES scores for predicting article quality assessment.  
+
+% Second, the scores have grounded statistical interpretations as the linear predictor in an ordinal quality model.  Given the intercepts from the model, the scores are directly interpretable as a probability distribution over article quality classes.  
+
+
+\subsubsection{Use the MPQC or ordinal quality scores when article quality is the dependent variable}
+
+Using the MPQC as the outcome in an ordinal regression model, as is done by \citet{shi_wisdom_2019} in their analysis of Wikipedia articles with politically polarized editors, is a reasonable choice as long as it provides sufficient variation and a more granular quality measure is not needed. Although it is theoretically possible that using the MPQC might introduce statistical bias because it is less accurate than ordinal quality scores for units of analysis other than the quality class and omits variation within quality classes, such threats to validity do not seem more significant than the threat introduced by inaccurate predictions. If the MPQC does not provide sufficient granularity and a continuous measure is desired as in \citet{halfaker_interpolating_2017} or \citet{arazy_evolutionary_2019}, I recommend using a measure based on ordinal regression as described in §\ref{sec:qciv}.
+
+
+\subsection{Limitations}
+
+Although intuitions about the varying degrees of effort required to develop articles with different levels of quality led me to question the ``evenly spaced'' assumption, my findings that quality classes are not evenly spaced do not necessarily reflect relative degrees of effort.  Rather, spaces between levels are chosen to link a linear model to ordinal data.  The spacing of intervals depends on the ability of the ORES scores to predict quality classes. The ORES article quality model has relative difficulty classifying \textit{C-class} and \textit{B-class} articles \citep{halfaker_interpolating_2017}. Perhaps, the differences between these quality classes are minor compared to the other classes. Maybe ORES lacks the features or ability to model these differences and the space between these classes will grow if its predictive performance improves.  
+
+The usefulness of article quality scores depends on the accuracy of the model. The ORES quality models are accurate enough to be useful for researchers, but they still only predict the correct quality class 55\% of the time on a balanced dataset.  Of course, this limits the accuracy of the ordinal regression models reported here. 
+Furthermore, while the ORES quality models were designed with carefully chosen features intended to limit biases \citep{halfaker_ores_2020}, it is still quite plausible that the accuracy of predictive quality models may vary depending on characteristics of the article \citep{kleinberg_inherent_2016}.  Such inaccuracies may introduce bias, threaten downstream analysis or lead to unanticipated consequences of collaboration tools built upon the models \citep{teblunthuis_effects_2021}. Therefore, improving the accuracy of article quality prediction models is important to the validity of future article quality research. Adopting machine learning models that can incorporate ordinal loss functions is a promising direction and can reduce the need for auxiliary ordinal regression models \citep{cardoso_learning_2007}.
+
+This paper only considers measuring article quality for English language Wikipedia, but expanding knowledge of collaborative encyclopedia production depends on studying other languages as audiences and collaborative dynamics can greatly vary between projects \citep{hecht_tower_2010,lemmerich_why_2019,teblunthuis_dwelling_2019}.  Other languages carry out quality assessments \citep{lewoniewski_relative_2017}, and some of these have been used to build ORES article quality models.  Future work should extend this project to provide multilingual article quality measures in one continuous dimension.
+
+An additional limitation stems from the likelihood that peer-produced quality labels are biased. For instance, the English Wikipedia community has a well-documented pattern of discrimination against content associated with marginalized groups such as biographies of women \citep{tripodi_ms_2021, menking_people_2019} and indigenous knowledge \citep{van_der_velden_decentering_2013}. Although demonstrating biases in article quality assessment is a task for future research, if Wikipedians' assessments of article quality are biased then model predictions of quality will almost certainly be as well.   
+
+
+\section{Conclusion}
+Measuring article quality in one continuous dimension is a valuable tool for studying the peer production of information goods because it provides granularity and is amenable to statistical analysis.  Prior approaches extended ORES article quality prediction into a continuous measure under the ``evenly spaced'' assumption. I showed how to use ordinal regression models to transform the ORES predictions into a continuous measure of quality that is interpretable as a probability distribution over article quality levels, provides an account of its own uncertainty and does not assume that quality levels are ``evenly spaced.''  Calibrating the models to the chosen unit of analysis improves accuracy for research applications. I recommend that future work adopt this approach when article quality is an independent variable in a statistical analysis.
+
+\section{Code and Data Availability}
+Code, data and instructions for replicating or reusing this analysis are available in the Harvard Dataverse at \url{https://doi.org/10.7910/DVN/U5V0G1}.
+
+\section*{Acknowledgements}
+
+I am grateful to the members of the Community Data Science Collective for their feedback on early drafts of this work including Kaylea Champion, Sneha Narayan, Jeremy Foote, and Benjamin Mako Hill. I would also like to thank Aaron Halfaker for encouraging me to write this after seeing a preliminary version. Thanks to Stuart Yeates and other participants in the \texttt{wiki-research-l} mailing list \href{mailto:wiki-research-l@lists.wikimedia.org}{\nolinkurl{wiki-research-l@lists.wikimedia.org}} for answering my questions about measuring article quality and effort. Finally, thank you to the anonymous OpenSym reviewers whose careful and constructive feedback improved the paper.
+
+
+% bibliography here
+\setcounter{biburlnumpenalty}{9001}
+\printbibliography[title = {References}, heading=secbib]
--- a/dissertations/nathante_uw_2021/appendix_B_readingtime.tex
+++ b/dissertations/nathante_uw_2021/appendix_B_readingtime.tex
@ -0,0 +1,659 @@
+
+\chapterprecishere{
+Much existing knowledge about global consumption of peer-produced information goods is supported by data on Wikipedia page view counts and surveys. In 2017, the Wikimedia Foundation began measuring the time readers spend on a given page view (dwell time), enabling a more detailed understanding of such reading patterns. In this paper, we validate and model this new data source and, building on existing findings, use regression analysis to test hypotheses about how patterns in reading time vary between global contexts. 
+Consistent with prior findings from self-report data, our complementary analysis of behavioral data provides evidence that Global South readers are more likely to use Wikipedia to gain in-depth understanding of a topic. We find that Global South readers spend more time per page view and that this difference is amplified on desktop devices, which are thought to be better suited for in-depth information seeking tasks.
+% Olga's comment: I would switch "patterns in reading time vary between global contexts" above with the previous mention of Global South.  i.e. "Here we build on findings about patterns of readership varying between global contexts from recent large-scale..." and then below say "how patterns in reading time vary between the Global South and Global North"
+%We also observe patterns consistent with skills gaps between Global South and Global North audiences as Global South readers also seem to spend more time 
+% consider knowledge gaps
+%Here, we use this data to answer questions like: How does time spent vary from language edition to language edition, or between different kinds of readers or articles?  How can we determine whether a new design change increases the time spent by readers?  
+%This report explores this newly available data, to provide insights on how people use Wikipedia that can inform and influence our future product direction. 
+%This data allows us to answer questions like: 
+%We validate this data and begin to answer questions such as the ones above. We observe the limitations of the data, most notably a high rate (57\%) of missing data on mobile devices. It is important to consider these shortcomings, but we believe that the data can be fruitfully applied to improve our current knowledge of how people read Wikipedia. We used regression analyses to explore how factors like page length, device choice, and the locations of readers are related to reading times. We believe that our results for device choice and reader location offer behavioral data to corroborate findings from a large-scale survey of Wikipedia readers of 14 language editions \cite{lemmerich_why_2019}.
+}
+
+\section{Introduction}
+
+%How does Wikipedia readership vary across different geographic and developmental contexts?  
+% Perhaps this can be more general 
+How do Wikipedia readers vary across different geographic and developmental contexts?
+%information-seeking tasks differ between people in countries compared to more developed countries? 
+A recent study of readers of different Wikipedia language editions found that readers in countries with a lower human development index (HDI) were more likely to read for in-depth understanding compared to readers in high-HDI countries \citep{lemmerich_why_2019}.  However, that study is limited by the use of self-reported data, which can be biased by effects of social desirability and self-selection due to the volunteer nature of web-based surveys \citep{antin_social_2012, hill_wikipedia_2013, kiesler_response_1986,  phillips_effects_1972}. Our study provides additional support for this finding from large-scale observation of reading behavior across contexts with varying levels of development.
+
+Wikipedia contributors generally start as Wikipedia readers. Therefore, understanding and better supporting readership is important for the continued growth of the Wikimedia movement \citep{preece_reader--leader_2009}.
+In 2017, the Wikimedia Foundation's web team introduced new instrumentation to measure the amount of time Wikipedia readers spend on the pages they view. We utilize this newly available data source, which provides additional information over the widely used page view data.  With reading times in our field of view, it becomes clear that not all views are created equal. Some page views seem to involve in-depth reading, yet most are quite short.   
+
+
+%In that sense, results such as ours also contribute to the knowledge about open collaboration processes, complementing research that is based on the detailed contributor data available from MediaWiki wikis. 
+
+% \url{https://meta.wikimedia.org/wiki/Research:Which_parts_of_an_article_do_readers_read\#Eyetracking}{eye 1tracking}, 
+% 
+%\url{https://meta.wikimedia.org/wiki/Research:Which_parts_of_an_article_do_readers_read
+% maybe we should replace this figure a similar one showing the medians for each group. 
+
+\begin{figure}
+\centering
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\columnwidth]{figures/GN_session_device_plot-1} 
+\end{knitrout}
+%\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/17.png}
+\caption{Marginal effects plot showing dwell times on Wikipedia pages predicted by our regression model. Compared to readers in the Global North, readers in the Global South spend substantially more time reading when on desktop devices.\label{fig:model1bplot}}
+\end{figure} 
+
+We begin our analysis by evaluating the quality of the adopted approach for measuring reading times. We find limitations including a high rate of missing data on mobile devices and a low rate of invalid (missing or negative) measurements. However, we believe that the data can be generally informative as long as these limitations are considered. We then present a summary of the data and estimate the total time spent reading Wikipedia. 
+
+Next we evaluate probability models for reading time data. In addition to validating assumptions that underlie the use of parametric statistics and regression models used in answering our research questions, model selection can also help evaluate theorized data generating processes that predict when a given model will be a good fit for the data \citep{mitzenmacher_brief_2004, stumpf_critical_2012}. For instance, Liu et al. (2010) analyze dwell times using Weibull models, finding evidence for ``screen-and-glean'' patterns in which people first spend a short amount of time to assess a web page, and then decide whether to read it in-depth \citep{liu_understanding_2010}. We evaluate several probability distributions on the data from Wikipedia readers, and find that the Weibull model is not a good fit, but that the log-normal distribution fits the data well enough to justify using the geometric mean as a metric.
+
+Finally, we return to our study of global reading behavior. Consistent with the results of Lemmerich et al., we find that readers in countries with lower HDI or in the so-called Global South spend more time reading per page view compared to readers in the Global North or in countries with higher HDI \citep{lemmerich_why_2019}. Moreover, this difference is amplified where we would expect users to consume information in depth: on the desktop (non-mobile) site. While we also hypothesized that the difference would likewise be greater in the last page-view in a session, this idea was not supported by our data analysis. We demonstrate these patterns using both multivariate regressions and a simple non-parametric analysis.
+
+% We also report on how page length, device use, and the property of being the last page view in a session relate to reading time.
+
+\section{Background}
+
+\subsection{Wikipedia readership}
+ 
+Reading behavior on Wikipedia has been studied extensively, with a 2014 literature review listing 99 publications by 2011 \citep{okoli_wikipedia_2014}.  Page view count data is central to this body of work when it comes to quantifying the attention readers give to particular topics or entire Wikipedia language editions. According to Priedhorsky et al., ``the most common application [of page view data] is detection and measurement of popular news topics or events,'' with other uses including forecasting attempts (of e.g. box office revenues) and the study of Wikipedia's own processes \citep{priedhorsky_measuring_2017}. As an example of research using it to examine information imbalances, building on earlier work by Gorbatâi and others, Warncke-Wang et al. compared page view data with article quality ratings, and found ``misalignment between supply and demand'', as the Wikipedia articles with the most views were often not the highest quality  \citep{gorbatai_exploring_2011, warncke-wang_misalignment_2015}.
+Other, less frequently used research strategies include using click streams and session lengths \citep{halfaker_user_2015, paranjape_improving_2016}. 
+
+%\cite{singer_why_2017}
+
+% let's elaborate on issues with self-selection bias in Wikipedia surveys down here. 
+Surveys are another important source of information about Wikipedia readership \citep{okoli_wikipedia_2014}. %p.23
+As mentioned in the introduction, such voluntarily self-reported data are subject to participation and social desirability biases.  Participation biases from self-selection may have had significant effects in the case of a previous Wikipedia reader and editor survey \citep{hill_wikipedia_2013}.
+
+Some previous research on Wikipedia readership has already used an approximation of reading time that assumes that the end of a page view is always marked by a new web-request originating from the same IP and user agent \citep{singer_why_2017}. Apart from the limitations arising from using the IP/user agent combination as a substitute for a user ID, this approach also does not allow measuring the dwell time for the last page view in a session.%the latter limitation shared with Google Analytics
+
+%We do not investigate these differences any further in this report because we lack knowledge of the specific contexts of each community and their audiences which would be necessary to adequately explain them. Instead, we present an analysis of the relationship between reading time and the development level of reader's countries to offer a more general explanation of one factor that might make a difference.  
+
+% TODO make sure that we are clear that dwell times == reading times
+\subsection{Dwell times and information seeking}
+It has long been observed that page view numbers can paint a misleading picture of the amount of attention spent by web readers, or the information value a web site provides to them. An early study of search engine users found in 2003 that typical reading times were ``substantially less than has been previously reported using survey data'' \citep{jansen_analysis_2003}. In more recent years, metrics based on page dwell time (or total time spent on a site) have been adopted more widely. 
+%[mention Google Analytics' "Time on Page" and its last-in session limitation here?]
+A prominent example is the online publishing platform Medium.com, which in 2013 declared ``Total Time Reading'' (TTR) as their ``Only Metric That Matters.'' Distancing themselves from widely adopted web analytics metrics such as page views or active users, they argue that the act of reading should be seen as the most relevant form of user engagement for content websites \citep{davies_mediums_2013}. 
+
+Much prior work on web page dwell times focuses on applications in information retrieval and content recommendation \citep[e.g.,][]{kim_modeling_2014,yi_beyond_2014,yin_silence_2013}.  Long dwell times can  signal  successful information retrieval in search applications because they suggest that the user has found sought information \citep{kim_modeling_2014}. Liu et al. analyzed dwell time data collected through a web browser plugin to characterize types of web content  \citep{liu_understanding_2010}. However, factors beyond content may influence dwell times including psychological processes of decision making and individualized styles of content consumption \citep{yin_silence_2013}. As we compare Wikipedia readers using mobile and desktop devices it is worth noting that dwell times are likely to be longer on desktop computers compared to mobile devices \citep{yi_beyond_2014}. 
+
+
+% Screen and glean
+% use in recommendation systems
+% psychology
+
+\subsection{Global device and knowledge gaps}
+%Geographic distribution of content production on Wikipedia
+% cultural context content
+% a lot of these ideas suggest that people with lower skills are less likely to engage.  
+
+% subsection device gaps
+% tie this to our hypotheses about mobile
+% We really need a theory of how information needs between 
+We seek to understand differences in Wikipedia's audience between the areas roughly known as the Global North and the Global South.  Lemmerich et al. show empirical differences in self-reported information seeking behavior between such contexts \citep{lemmerich_why_2019}. These differences are likely related to digital divides or gaps between the knowledge, information and technology resources commonly available in different contexts, which can lead to systematic differences in reading behavior.
+
+For people to use the Internet (or Wikipedia), they have to be able to connect to it, but not all forms of access are equally suited for a given task \citep{deursen_toward_2015}. Deursen et al. suggest that personal computers will be better for in-depth information seeking, while mobile devices, which are often close at hand, have advantages for social interaction \citep{deursen_toward_2015}. As Internet access becomes more ubiquitous, gaps in skills and knowledge about how to use the Internet are increasingly salient digital dividers and can be reinforced by device gaps  \citep{deursen_compoundness_2017,hargittai_second-level_2002}. For instance, in many parts of the non-western world, mobile phones diffused before PCs, and skills for PC usage may be less widespread \citep{napoli_emerging_2014, pearce_digital_2013}. We contribute new information about the interaction between device use around the world and how people read Wikipedia. 
+
+Gaps in skills and knowledge may also help explain gaps in who contributes to Wikipedia \citep{shaw_pipeline_2018}. Wikipedia promises to advance over traditional modes of knowledge production in which dominant western attitudes shape what people and places will be included and how they will be represented in authoritative sources like encyclopedias \citep{graham_uneven_2014}. In theory, peer production can empower people around the world to add their local knowledge of their places to Wikipedia. Yet even as global access to Wikipedia grows, it is slow to fulfill these promises. Gaps in coverage of cultural knowledge reflect and reinforce structural digital divides at many levels that ``disadvantage many of the world's informational peripheries'' \citep{graham_uneven_2014}.  These gaps in Wikipedia's coverage help motivate a better understanding of global readership.
+
+
+
+%Differences in reading times across global contexts may relate to gaps in access, skills, or knowledge  needed to efficiently discover, filter, and interpret information.
+
+
+% Olga's badass paragraph
+In this paper, we use the Human Development Index (HDI) and the Global South/Global North regional classification as means of comparing countries separated by varying levels of development.  We recognize that both are insufficient for defining economic development.  Furthermore, these concepts and our measures of them only provide an incomplete understanding of the unique identities and motivations of cultures within an information-seeking context.  What's more, they do not take into consideration inequality within a geographic region due to minority populations, which may affect the utility of averages such as GDP, income, and life expectancy.  We hope that this work provides a basis of study that may be continued with work that takes into account individual cultural context, internet accessibility, and internal inequality.  
+
+%"content rich depth searches might be better conducted on personal computers or laptops while handheld devices might be most appropriate for using social media as a vehicle for social interaction."
+
+%\cite{buchi_modeling_2016}
+%\cite{scheerder_determinants_2017}
+% cite Hargattai on skills
+%\cite{shaw_pipeline_2018}
+
+
+
+\section{Methods}
+\subsection{Collecting reading time data}
+
+Our data collection instrument, the reading depth plugin
+%works by running JavaScript in the client browser which sends two messages to the server during a page view.\footnote{\url{https://meta.wikimedia.org/wiki/Schema:ReadingDepth}}  The first message is sent when the page is loaded and the second message is sent when it is unloaded. The page unloaded event sends values from timers that measure, among other things, the amount of time that the page was visible in the visitor's browser window. 
+%More specifically, the plugin
+uses the page visibility API to measure  \emph{time visible}, the total amount of time that the page was in a visible browser tab.\footnote{See \url{https://meta.wikimedia.org/wiki/Schema:ReadingDepth} \textit{archived at} \url{https://perma.cc/JK75-Y6DH} and   \url{https://developer.mozilla.org/en-US/docs/Web/API/Page\_Visibility\_API} \textit{archived at} 
+\url{https://perma.cc/79PB-389J}} The instrument also records a second candidate measure of reading time:  \emph{total time.} This is simply the entire time the page was loaded in the browser. We used this variable for data validation and in robustness checks.  We chose to focus on \emph{time visible} because it excludes time when the user could not possibly have been reading the page. This is similar to the client-side approach described in Yi et al. (2014) \citep{yi_beyond_2014}.
+
+Beginning November 20th 2017, we logged events from a 0.1\% sample of visitor sessions.\footnote{Sessions are based on a random identifier recorded in the browser's \textit{sessionStorage}, which expires at the end of each browser session. This is more privacy-friendly than the common approach (as used in e.g. Google Analytics) of tracking users via a cookie, in that the session identifier is not sent with every request to  Wikimedia servers. It also differs from session cookies in that a new identifier will be used for links opened in a new browser tab or window.} The sampling rate was increased to 10\% on September 25, 2018 to support future studies at a higher level of granularity.%\footnote{\url{https://phabricator.wikimedia.org/T205176}}
+
+Since we care about the reading behavior of humans, we identify bots using user agent strings and exclude them from all of our analyses.\footnote{See \url{https://meta.wikimedia.org/wiki/Research:Page_view/Tags\#Spider} \textit{archived at} \url{https://perma.cc/3NSL-X6L2}}
+
+\subsection{Missing data}
+We are only able to collect data from web browsers that support the APIs on which the instrument depends. Also, we excluded certain user agents that were found to send data unreliably in our testing, namely the default Android browser, versions of Chrome earlier than 39, Safari, and all browsers running on versions of iOS older than 11.3. We also do not collect data from browsers that have not enabled JavaScript or that have enabled Do Not Track.\footnote{See \url{https://en.wikipedia.org/wiki/Do_Not_Track} \textit{archived at} \url{https://perma.cc/J368-ZYBD}}
+
+Even when the above conditions are met, in some cases we are still not able to collect data. Sometimes we observe a page loaded event, indicating that a user in our sample opened a page, but we do not observe a corresponding event indicating that the user has left the page (a page unloaded event). This issue affects 57\% of records on the mobile site and about 5\% of records on the desktop site. The likely explanation for why many mobile views are affected is that many mobile browsers will fail to send a page-unloaded event in certain situations, such as when the user closes the browser app using the app switcher.\footnote{We are planning to remedy this issue in future versions of the instrumentation, by making use of alternatives to the page unloaded event available in modern browsers, e.g. the Page Lifecycle API introduced in Google Chrome in 2018.
+%(https://phabricator.wikimedia.org/T219212)
+}  We only include page views for which we observe exactly 1 page loaded event and 1 page unloaded event and remove 0.016\% of page unloaded events where, for unknown reasons, the instrument recorded a page visible time that was less than 0 or undefined.
+
+\subsection{Taking a sample}
+
+Because Wikipedia is so widely read, even a 0.1\% sample results in an amount of data exceeding the statistical requirements of this analysis. We therefore conduct our analysis on random sub-samples of the collected data. 
+
+To ensure that all Wikipedia language projects are fairly and adequately represented in our sample, we use stratified sampling by assigning a \emph{weight} to each group that adjusts the probability that members of the group are chosen in the sample. This introduces a \emph{known bias} in the resulting sample, which is corrected using the \emph{weights} in ways analogous to weighted averaging.
+For estimating total reading time, and for distribution selection, we stratify by wiki, taking up to 20,000 data points for each wiki and excluding wikis that have fewer than 300 data points. This leaves us with 242 wikis in our sample. In the multivariate analysis below, we stratify by wiki, by the country of the reader's approximate location, and by whether or not we think that the user is on a mobile device. We sample up to 200 data points for each stratum and analyze a sample of 285 wikis.
+
+% per Casey Fiesler this should go in the methods section
+\subsection{Ethical considerations}
+Our approach in this paper relies on large-scale observational data collected by monitoring the behavior of Wikipedia visitors.  We neither see nor speak to the humans on the other side of the screen. In addition to the empirical limitations discussed below, this approach is subject to epistemic limitations. It makes those behaviors that we can observe through browser APIs visible, while obscuring those we cannot. It cannot speak to how people in different countries understand their experience of Wikipedia \citep{graham_geography_2013}. Furthermore, ``big data'' approaches carry critical and novel ethical risks that are not easily understood in conventional informed-consent and human subjects research frameworks \citep{boyd_critical_2012}. 
+
+Wikimedia's privacy policy endeavors to clearly communicate that the information we use here will be collected, but we do not consider this an ethical license to use this data however we see fit.\footnote{See  \url{https://foundation.wikimedia.org/wiki/Privacy_policy} \textit{archived at} \url{https://perma.cc/C4VQ-HWRT}} We chose an analysis that we believe poses minimal risk to Wikipedia visitors' expectations, trust, and autonomy \citep{fiesler_participant_2018}.\footnote{We followed the Wikimedia Foundation's (WMF) guidelines and processes for conducting research. As it is not a federally funded institution, research at the WMF is not supervised by an institutional review board (IRB).} 
+Each observation of individuals in our study was aggregated with many others at a coarse level of granularity. We chose to study the country level partly because our geolocation measure is most accurate at that level, but also because it is very coarse. We do not track people from one session to another, and do not look at the content of the pages they visit other than the page length. We exclude people from our analysis who indicate a wish for privacy by enabling Do Not Track in their browsers, and will discard any session identifiers remaining in the data collected for this analysis after it is complete.
+
+%We acknowledge that these steps do not guarantee that everyone whose behavior we analyze would be comfortable with traces of their activity being so used.  
+
+
+\begin{figure}[t]
+\begin{center}
+%\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/1.png}
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\columnwidth]{figures/histograms_1-1} 
+\end{knitrout}
+
+\end{center}
+
+\caption{The distribution of dwell times across 242 language editions of Wikipedia.  The top chart shows a histogram of dwell times less than one hour long (the x-axis is truncated to 300 seconds for clarity). In this chart we can see that the median dwell time is about 25 seconds long and that the distribution of dwell times is very skewed, with the arithmetic mean far from the median. The y-axis represents the probability that a given page view is in a given bin. In the lower figure, the dwell times are log-transformed and the data appear bell-shaped, with some skew to the right.}
+\label{fig:hists}
+\end{figure}
+
+\section{Distribution of reading times}
+Here we present summary statistics and a high level description of reading behavior on Wikipedia in terms of dwell times. When someone opens a given page on Wikipedia, how long do they typically stay on the page? Are reading times highly skewed? How much does reading behavior vary across different language editions of Wikipedia?  How much time does all of humanity spend reading Wikipedia? 
+
+\subsection{Wikipedia as a whole}
+
+In general, the distribution of reading times is very skewed (see \hyperref[fig:hists]{Figure \ref*{fig:hists}}). The median reading time is 25 seconds and the 75th percentile is 75.1 seconds. This skewness pushes the arithmetic mean far from most of the mass of the distribution. Therefore, the geometric means, medians, and other percentiles have more utility within our discussion of reading times.
+
+\subsection{Total time spent}
+
+Based on our data, we estimate that humanity spent about \emph{672,349 years} reading Wikipedia from November 2017 through October 2018. We calculated this estimate by multiplying the mean reading time on each Wikipedia wiki by the number of page views on that wiki, excluding readers using the mobile apps and identified bots. It is possible that some people leave Wikipedia pages visible in their browsers for extended periods of time without reading. To make our estimates of total reading time in this section somewhat conservative, we rounded all dwell times longer than 1 hour down to 1 hour.
+
+\subsection{Variation between different language editions}
+
+
+% We hope these plots will be readers who may wish to know how reading times compare between their wikis of interest. 
+\hyperref[fig:kernelplots]{Figure \ref*{fig:kernelplots}} shows kernel density estimates of the distribution of page visible times on several Wikipedia language editions selected to highlight projects of different sizes and of different cultures. These are Arabic (ar), German (de), English (en), Spanish (es), Hindi (hi), Dutch (nl) and Punjabi (pa). As above, we place unscaled data side-by-side with log-transformed data. Only the log-transformed plots show the full range of the data. Similar kernel density plots for other languages as well as box-and-whisker plots are available in our online supplement.\footnote{Available at \url{https://w.wiki/5Jo}.}
+
+\begin{figure}
+
+%\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/4.png}
+\centering
+
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\columnwidth]{figures/kernelplots-1} 
+\end{knitrout}
+\caption{Kernel density plots of the distribution of dwell times on a selection of wikis. Spanish, Hindi, and Arabic appear to have longer reading times while English and Punjabi appear to have somewhat shorter reading times. In general, the distribution is very skewed, as these example wikis demonstrate.\label{fig:kernelplots}} 
+
+\end{figure}
+
+%We observe a great deal of variation in the distribution of reading times between different language editions. 
+
+\begin{table}[t]
+\centering
+\begin{tabular}[]{@{}llllll@{}}
+\toprule
+wiki & 5\% & 25\% & 50\% & 75\% & 95\% \\
+\midrule
+all wikis & 1.8 & 8.0 & 25.0 & 75.1 & 439.1 \\
+ar & 5.2 & 5.2 & 21.5 & 69.9 & 371.7 \\
+de & 14.1 & 14.1 & 14.1 & 56.6 & 482.7 \\
+en & 37.2 & 37.2 & 37.2 & 37.2 & 262.4 \\
+es & 23.3 & 23.3 & 23.3 & 65.5 & 616.4 \\
+hi & 2.5 & 11.4 & 31.4 & 82.6 & 360.5 \\
+nl & 6.1 & 6.1 & 15.9 & 60.1 & 441.8 \\
+pa & 2.0 & 7.2 & 19.5 & 55.4 & 303.1 \\
+\bottomrule
+\end{tabular}
+\caption{Percentiles for reading times (in seconds) on selected Wikipedia editions\label{tab:wikilangpercentiles}}
+\end{table}
+
+ 
+\section{Univariate model selection}
+
+\subsection{Motivation}
+
+Analysts of reading times on Wikipedia will wish to make parametric assumptions to justify the use of statistical models for evaluating experiments, drawing comparisons between different samples of reading times, and performing multivariate analyses as we do below. This requires assuming
+a probability distribution with interpretable parameters such
+as mean, variance, and shape parameters. Fitting parametric
+distributions to data allows us to estimate these parameters
+and to statistically test changes in the parameters. However,
+parametric models can mislead if they don't fit the data well. Below, we evaluate several models. 
+
+
+\subsection{Candidate models}
+We consider the following distributions in our model-selection process.
+
+\textbf{Log-normal distribution:} This is a normal distribution, but on a logarithmic scale. Differences in means between log-normal samples can be tested using t-tests.  Such advantages make the log-normal distribution a common choice in analyzing skewed data, even when it is not a perfect fit.
+ 
+% This gives it convenient properties because its parameters the mean and variance of the log-transformed data. 
+
+\textbf{Lomax (Pareto Type II) Distribution:} Datasets on human behavior often exhibit power-law distributions, meaning that the probability of extreme events, while still low, is much greater than would be predicted by a normal (or log-normal) distribution \citep{clauset_power-law_2009}. We fit the Lomax Distribution, a commonly used long-tailed distribution with two parameters that assumes that power law dynamics occur over the whole range of the data. 
+ 
+\textbf{Weibull Distribution:} Liu et al. model reading times on web pages using a Weibull Distribution \citep{liu_understanding_2010}. This model has two parameters: {\bfseries {${\displaystyle \lambda }$}}, a scale parameter, and {\bfseries {${\displaystyle k}$}}, a shape parameter. The Weibull distribution can be a useful model because of the intuitive interpretation of {\bfseries {${\displaystyle k}$}}. If {${\displaystyle k>1}$}, then reading behavior exhibits  positive aging, which means that the longer someone stays on a page, the more likely they are to leave the page at any moment. Conversely {${\displaystyle k<1}$} is interpreted as negative aging, which means that as someone remains on a page, they become less likely to leave the page at any given moment. The Weibull distribution is often used in the context of reliability engineering for modeling the chances that a given part will fail at a given moment. 
+
+\textbf{Exponentiated Weibull Distribution:} The Weibull model assumes that the rate of readers leaving a page changes monotonically over time. This implies there must be either negative aging, positive aging, or no aging. It excludes more complicated dynamic processes where positive aging gives way to negative aging after a point in time. The exponentiated Weibull distribution is a three-parameter generalization of the Weibull distribution that relaxes this constraint \citep{pal_exponentiated_2006}. The extra degree of freedom will allow this model to fit a greater range of empirical distributions compared to the two-parameter Weibull model.
+
+ %Therefore if the data show that the likelihood of a reader leaving a page first increases and then decreases (or vice versa) then assumptions of the Weibull model are violated.  
+ 
+ 
+%We also considered the \url{https://en.wikipedia.org/wiki/gamma_distribution}{gamma distribution} and the \url{https://en.wikipedia.org/wiki/exponential_distribution}{exponential distribution}, but we will not go into depth about them here. We didn't have a strong motivation for these models and they did not fit the data well.
+
+\subsection{Methods}
+
+Our method for model selection is inspired in part by Liu et al., who compared the log-normal distribution to the Weibull distribution of dwell times on a large sample of web pages \citep{liu_understanding_2010}. They fit both models to data for each web page, and then compare two measures of model fit: the log-likelihood, which measures the probability of the data given the model (higher is better), and the Kolmogorov-Smirnov distance (KS-distance), which is the maximum difference between the model CDF and the empirical CDF (lower is better). For the sample of web pages they consider, the Weibull model outperformed the log-normal model in a large majority of cases according to both goodness-of-fit measures. 
+
+Similar to the approach of Liu et al., we fit each of the models we consider on reading time data, separately for each Wikipedia project \citep{liu_understanding_2010}. In addition to the KS-distance, we also use KS-tests of the null hypothesis that the model is a good fit for the data to evaluate goodness-of-fit \citep{clauset_power-law_2009}. For the sample sizes we use, passing the KS-test is a high bar. 
+
+Adding parameters can increase model fit without improving out-of-sample predictive performance or explanatory power.  To make fair comparison between models with different numbers of parameters, we use the Akaike information criterion (AIC) and the Bayesian information criterion (BIC) instead of the log-likelihood. Both criteria attempt to quantify the amount of information lost by the model (lower is better), by evaluating the log likelihood, and adding a penalty for the model parameters. The difference between AIC and BIC is that BIC maintains the penalty for larger sample sizes.\footnote{We provide a more detailed example of this procedure in our online supplement at \url{https://w.wiki/5Jo}.}
+
+%This allows us to go beyond Liu et al.  by evaluating whether each distribution is a plausible model, instead of just whether one distribution is a better fit 
+
+%We also use the KS distance to evaluate goodness-of-fit.  The
+%The KS-test is quite sensitive to deviations between the model and the data, especially in large samples.  than another.
+
+%compare two distributions that each have 2 parameters, but the models we consider have different numbers of parameters (the exponentiated Weibull model has 3 parameters and the exponential model has only 1). Adding parameters can increase model fit without improving out-of-sample predictive performance or explanatory power. To avoid the risk over-fitting and to make a fair comparison between models we 
+
+Following Liu et al., we build these goodness-of-fit measures for each wiki and rank them from best to worst \citep{liu_understanding_2010}. For each distribution, we report the mean and median of these ranks. In addition, we report the mean and median p-values of the KS-tests as well as the number and proportion of wikis that pass the KS-test for each model. 
+
+We fit the models using SciPy. The exponentiated Weibull, Weibull, and Lomax models were fit using maximum likelihood estimation and the log-normal distributions were fit using the method of moments.
+
+
+%We also use diagnostic plots to compare the empirical and modeled distributions of the data in order to explain where models are failing to fit the data.  Because the data is skewed, we log the X axis of these plots.
+
+%The diagnostic plots are shown with data on English Wikipedia. On this wiki, the exponentiated Weibull model is the best fit, followed by the Lomax model and then the log normal model and only the exponentiated Weibull model passes the KS test.
+
+\section{Results}
+
+%\subsection{Goodness-of-fit metrics}
+
+\hyperref[tab:gof]{Table \ref*{tab:gof}} below shows the results of this procedure. The Lomax, exponentiated Weibull, and  Log-normal all fit the data reasonably well. All pass the KS-test for many wikis, and are in a three-way tie for best median rank according to AIC.  Despite this, none of our candidate models  pass the KS test for all wikis: There are 28 wikis where all 4 models fail to pass at the 95\% level, and 13 wikis where they all fail at the 97.5\% level. 
+
+% this table is broken
+\begin{table*}
+\centering
+\begin{footnotesize}
+\centering
+\begin{tabular}[]{@{}lllllllllllll@{}}
+\toprule
+\emph{model} & \multicolumn{2}{c}{AIC rank} & \multicolumn{2}{c}{BIC rank} & \multicolumn{2}{c}{KS rank} & \multicolumn{2}{c}{KS p-value} &
+\multicolumn{2}{c}{KS 95\%} & \multicolumn{2}{c}{KS 97.5\%} \\
+\midrule
+&               mean & med. & mean & med. & mean & med. & mean & med. & mean & passing & mean &  passing \\
+Lomax &          1.78 & 2 & 1.70 & 1 & 2.09 & 2 & 0.26 & 0.17 & 0.79 & 192 & 0.87 & 211 \\
+Log-normal &     2.20 & 2 & 2.10 & 2 & 2.33 & 2 & 0.27 & 0.17 & 0.71 & 173 & 0.79 & 191\\
+Expon. Weibull & 2.15 & 2 & 2.34 & 3 & 2.11 & 2 & 0.29 & 0.23 & 0.77 & 187 & 0.84 & 203 \\
+Weibull &        3.98 & 4 & 3.94 & 4 & 3.84 & 4 & 0.07 & 0.00 & 0.24 & 59 & 0.30 & 72\\
+\bottomrule
+\end{tabular}
+\end{footnotesize}
+\caption{Goodness of fit statistics resulting from the model selection process on 242 wikis.  The Lomax, log-normal, and exponentiated Weibull distributions fit the data reasonably well, but the Lomax most often fits the best. The ``mean'' columns under KS 95\% and KS 97.5\% refer to the proportion of wikis passing KS-tests at the 95\% and 97.5\% significance levels, and the ``passing'' columns state the absolute number. \label{tab:gof}} 
+\end{table*}
+
+The Lomax distribution is the best fit across all wikis according to all metrics. With only 2 parameters, it has a lower AIC and BIC than the three-parameter exponentiated Weibull distribution and passes the KS-test 79\% of the time at the 95\% confidence level.
+The exponentiated Weibull model fits the data better than the log-normal model in terms of passing KS-tests and with respect to AIC. However, the log-normal is better in terms of BIC, which imposes a greater penalty on the additional parameter of the exponentiated Weibull model.
+
+The Weibull model fits substantially worse than the Lomax, log-normal, and exponentiated Weibull in terms of all of our goodness-of-fit metrics. In this respect, our results differ from those of Liu et al., who observed the Weibull model fitting dwell time data better than the  Log-normal model \citep{liu_understanding_2010}. We observe that for dwell times on Wikipedia, the Log-normal model is the better fit. While substantially worse than the Lomax model, the Log-normal model still passes the KS-test at the 95\% level for about 71\% of wikis in the sample. 
+
+\subsection{Discussion}
+
+We found that the Lomax, exponentiated Weibull, and log-normal models all fit the data within reason. We now discuss how each of these models can be applied to understanding Wikipedia reading behavior. 
+
+\textbf{Lomax (Pareto Type II) Distribution:} That the Lomax model fits well suggests that Wikipedia reading times may follow a power law. Mitzenmacher (2004) describes several possible data generating processes for power law (Pareto) and log-normal distributions \citep{mitzenmacher_brief_2004}.    Rich-get-richer dynamics such as preferential attachment are commonly associated with power law distributions, and a mixture of Log-normal distributions can also generate a power law \citep{mitzenmacher_brief_2004}. Deeper exploration of potential power-law dynamics in reading behavior is a potential avenue for future research. 
+
+%On the other hand, it is intuitive that a mixture of different log-normal processes may be involved in reading time, such as an exploration process mixed with a reading process or even a mixture of behavior patterns associated with different types of information consumption.  
+ 
+ \textbf{Log-Normal Distribution:} The log-normal model does not fit the data perfectly, but it fits well enough to be useful. It frequently passes KS-tests, and is preferred to the exponentiated Weibull by the BIC. Even though the Lomax model typically fits the data better, assuming a log-normal model justifies using t-tests to compare differences in geometric means when evaluating experiments. Furthermore, assuming log-normality can help justify using ordinary least squares to estimate regression models in multivariate analysis (as we do below) instead of  models that require maximum likelihood estimation. 
+ 
+ \textbf{Weibull Distribution:} The Weibull model did not fit the data well. %This was somewhat disappointing because we had hoped to analyze reading behavior in terms of the inferred parameter that indicates positive or negative aging.
+ While Liu et al. observed that the Weibull model out-performed the  log-normal model on their datasets, we (along with \citet{yin_silence_2013}) observe the opposite. However, the exponentiated Weibull model generalizes the Weibull, is a good fit for the data, and can help us explain why the Weibull does not fit the data well.  
+ 
+ \textbf{Exponentiated Weibull Distribution:} The exponentiated Weibull has 3 parameters \citep{pal_exponentiated_2006}. Two are shape parameters ({${\displaystyle \alpha >0}$} and {${\displaystyle \gamma >0}$}) and one is a scale parameter ({${\displaystyle \lambda >0}$}). The major qualitative distinctions in interpreting the model depend on the shape parameters.  In many cases the parameters can be interpreted in terms of a transition from negative to positive aging (or vice versa) after some threshold.  However, if either {${\displaystyle \gamma >1}$}, {${\displaystyle \alpha <1}$} or  {${\displaystyle \gamma <1}$}, {${\displaystyle \alpha >1}$} then qualitative interpretation may require closer inspection of estimated hazard functions. 
+ 
+Inconveniently, we estimated {${\displaystyle \alpha >1}$} and {${\displaystyle \gamma <1}$} for all but one of the 285 Wikipedia projects we analyzed. This limits the usefulness of exponentiated Weibull models for large-scale analysis on many wikis because the parameters are outside the area where the model leads directly to intuitive qualitative interpretations. However, by plotting the estimated hazard function we can see over what range of the data the hazard function is decreasing or increasing, accelerating or decelerating. 
+
+\begin{figure}
+
+
+%\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/5.png}
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\columnwidth]{figures/hazardplot-1} 
+\end{knitrout}
+
+\caption{Hazard functions for the parametric models estimated on English Wikipedia.  The exponentiated Weibull model (the best fit to the data) indicates that the hazard rate increases in the first seconds of a page view, after which we observe negative aging.\label{fig:hazards}}
+
+\end{figure}
+
+In \hyperref[fig:hazards]{figure \ref*{fig:hazards}} we observe that, on English Wikipedia, the log-normal and exponentiated Weibull models both indicate a brief period of positive aging, during which the instantaneous rate of page-leaving increases, followed by negative aging. This helps explain why the Weibull model is not a good fit for the data compared to the log-normal and exponentiated Weibull models: the Weibull distribution cannot model a non-monotonic hazard function.  While Liu et al. found it to be a good model for the distribution of dwell times in data collected through a web browser plugin, our analysis suggests that the behavior of Wikipedia readers may be somewhat more complex.  Perhaps whereas Liu et al. operationalized ``screen-and-glean'' as a monotonically decreasing hazard function, Wikipedia readers require more than 1 or 2 seconds to ``screen'' the page and during these first few moments, their hazard of leaving it increases. 
+
+%This proposition can be tested if the use of a feature that provides information about the content of a page before it is opened (such as page previews) leads to monotonically decreasing hazard rates.\footnote{https://www.mediawiki.org/wiki/Page\_Previews}% I don't understand the preceding sentence. --TB
+
+%\subsubsection{Distribution fitting plots}
+
+%To further explore how well these distributions fit the data, we present a series of diagnostic plots that compare the empirical distribution of the data with the model predicted distributions. For each of the four models under consideration (Lomax,  Log-normal, exponentiated Weibull, Weibull), we present a  density plot, a distribution plot, and a quantile-quantile plot (Q-Q plot). The density plots compare the probability density function of the estimated parametric model to the normalized histogram of the data.  Similarly the distribution plots compare the estimated cumulative distribution to the empirical distribution.  The Q-Q plots plot the values of the quantile function for the data on the x-axis and for the estimated model on the y-axis. These plots can help us explain diagnose ways that the data diverge from each of the models.  We present the x-axis of all these plots on a logarithmic scale to improve the visibility of the data. 
+%We show these plots for data from English Wikipedia. For this wiki, the liklihood-based goodness-of-fit measures indicate that the exponentiated Weibull model is the best fit (BIC = 19321) followed in order by the Lomax (BIC = 19351), the  Log-normal (BIC = 19373) and the Weibull (BIC = 20111), but the log-normal model is the only model that passes the KS test ({${\displaystyle p}$} = 0.089). 
+
+
+% {\scalefont{0.52741}\begin{longtable}{>{\RaggedRight}p{0.47143\linewidth}>{\RaggedRight}p{0.47143\linewidth}} 
+% \hspace*{0pt}\ignorespaces{}\hspace*{0pt}\begin{center}\uline{}\begin{minipage}{1.0\linewidth}\begin{center}\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/6.png}\end{center}\myfigurewithoutcaption{6}\end{minipage} \textbf{Figure 2.2.{\mbox{$~$}}}\emph{ The Lomax model accurately estimates the rate of long reading times, but its monotonic density overestimates the probability of very short reading times and underestimates that of reading times in the range of 1 110 seconds.}\end{center}\begin{center}\uline{}\begin{minipage}{1.0\linewidth}\begin{center}\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/7.png}\end{center}\myfigurewithoutcaption{7}\end{minipage} \textbf{Figure 2.4.{\mbox{$~$}}}\emph{ The Exponentiated Weibull model fits the data somewhat better than the  Log-normal model, but still overestimates the occurrence of very short reading times.}\end{center}&\hspace*{0pt}\ignorespaces{}\hspace*{0pt}\begin{center}\uline{}\begin{minipage}{1.0\linewidth}\begin{center}\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/8.png}\end{center}\myfigurewithoutcaption{8}\end{minipage} \textbf{Figure 2.3.{\mbox{$~$}}}\emph{ The  Log-normal model fits the data well, but overestimates the probability of very short reading times and underestimates the probability of very long reading times.}\end{center}\begin{center}\uline{}\begin{minipage}{1.0\linewidth}\begin{center}\includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/9.png}\end{center}\myfigurewithoutcaption{9}\end{minipage} \textbf{Figure 2.5.{\mbox{$~$}}}\emph{ The Weibull model is not a good fit for the data. On a log scale, the PDF is not only monotonically decreasing, it is concave up everywhere. It greatly overestimates the probability of very short and very long reading times while under estimating the probability of reading times between 10 and 1000 seconds.}\end{center} 
+% \end{longtable}
+% }
+
+%\cite{graham_warped_2008} argues that despite the metaphor of cyberspace as transcending space and time, that information technology 
+
+\section{Reading time and global contexts}
+% each of these are 1-3 paragraphs on things other people have done and 1 short paragraph about how we contribute
+Now we return to our analysis of Wikipedia readers in a global context. Our analysis is most closely inspired by Lemmerich et al.'s large-scale global survey of Wikipedia readers. They found that readers in lower-HDI countries are more likely to use Wikipedia in educational contexts and for intrinsic learning, but not for fact-checking \citep{lemmerich_why_2019}.  Such motivations and contexts are likely to involve longer sessions and dwell times compared to fact-checking \citep{lemmerich_why_2019, singer_why_2017}.
+Therefore, we predict that readers in lower-HDI countries and in the Global South are more likely to have longer dwell times on Wikipedia articles. 
+
+\textbf{H1:} Readers in countries with lower HDI (or the Global South) are more likely to spend more time reading each page they visit compared to readers in countries with higher HDI (or the Global North).
+
+%Despite the prior evidence in favor of \textbf{H1}, distributions of device gaps, reader fluency, internet skills, and internet connectivity might drive reading times in less developed and global South countries compared to more developed and Global North countries.  We attempt to build confidence that such factors do not drive the observed relationship in two ways. First we use multivariate regression to statistically control for observable factors such as device load time, device type, and page length. We also attempt to increase support for the theory that different kinds of information seeking drive the reading time gap by testing additional predictions of it \cite{stinchcombe_constructing_1987}. 
+
+We also test a second prediction of the theory that Global South readers are more likely to use Wikipedia for in-depth understanding.  If desktop devices have advantages for reading to gain in-depth understanding then users may be more likely 
+to choose these devices for such tasks (when they have the choice). Furthermore, Global South readers may also experience gaps limiting their access to desktop devices, and when they do have access may be likely to take advantage of such opportunities by reading longer.  Therefore, we expect users in countries within the Global South designation (or with lower HDIs) to read even longer on desktop devices. 
+
+\textbf{H2:} The difference between the reading times of readers in countries with lower HDI compared to readers in higher-HDI countries will be greater on desktop than on mobile devices.
+
+Based on the ``screen-and-glean'' model of information seeking behavior that Liu et al. observed on the web \citep{liu_understanding_2010}, we propose that reading of articles for in-depth understanding is most likely to take place in the last page view in a session.  Differences in reading time in other page views might be attributable to less efficient ``screening''---gaps in the skills required to efficiently sift through Wikipedia pages to find the page with the information sought.  However, the final page view in a session may reflect ``gleaning''---information consumption. If so, then the last page view in a session provides an opportunity to isolate information consumption from information seeking.
+
+Therefore if the gap between low and high development context readers is attributable to types of information seeking tasks, and in-depth reading tasks require more time spent ``gleaning,'' then we predict that the gap between reading time in low versus high HDI countries will also be amplified on the last page view in a session.
+
+\textbf{H3:} The difference between reading times in countries with lower HDI and countries with higher HDI will be greater on the last page view in a session than on other page views.
+
+On the other hand, a ``skills gap'' with respect to information screening may drive an opposite result. The gap between reading times in the Global South and the Global North may shrink on the last page view in a session if Global South readers are less efficient at filtering information.
+
+\subsection{Methods and measures}
+
+The EventLogging system records the date and time the page was viewed.
+We include \emph{Day-Of-Week} and \emph{Month} as statistical controls for seasonal and weekly reading patterns.  Including  \emph{NthInSession} statistically adjusts for the number of pages a reader has viewed so far in the session.   \emph{Revision Length}, the size of the wiki-page, measured in bytes, roughly accounts for the amount of content on the page. We use two other measures from the instrument to statistically adjust for page load time:  \emph{time till first paint}, the time from the request until the browser starts to render any part of the page; and \emph{dom interactive time}, the time from the request until the user can interact with the page.\footnote{See \url{https://developer.mozilla.org/en-US/docs/Web/API/PerformanceNavigationTiming/domInteractive} \textit{archived at} \url{https://perma.cc/RRA8-8SQG}. DOM refers to the page's ``document object model'' structure.} 
+
+We obtain the \emph{page length}, measured in bytes at the time the page was viewed, by merging the EventLogging data with the edit history. To understand how reading behavior on  \emph{mobile} devices differs from behavior on non-mobile (i.e. desktop) devices, we assume that visitors to mobile web-hosts (e.g. en.m.wikipedia.org) are using mobile devices and that visitors to non-mobile web-hosts (e.g. en.wikipedia.org) are on non-mobile (desktop) devices.
+
+We determine the approximate country in which a reader is located from the MaxMind GeoIP database which is integrated with the Wikimedia analytics pipeline.\footnote{See \url{https://wikitech.wikimedia.org/wiki/Analytics/Systems/Cluster/Geolocation} \textit{archived at} \url{https://perma.cc/C36T-2E4E}} We use the United Nations' human development index (\emph{HDI}) to measure the development level of the country.\footnote{From \url{http://hdr.undp.org/en/data} \textit{archived at} \url{https://perma.cc/SLQ3-HS8S}. The HDI is a number between 0 and 1.} We lack geolocation data before March 3rd 2018, which limits our analysis of reading times in the global context to the period from then until September 28th 2018. We standardize the HDI by centering to 0 and scaling it by the standard deviation (taken at the country level) because the partial residual plots of the interaction term between (unscaled) HDI and mobile were very skewed. This also allows us to interpret results in terms of standard deviations.
+
+We also use the established regional classifications of Global North and Global South\footnote{See \url{https://meta.wikimedia.org/wiki/List_of_countries_by_regional_classification} \textit{archived at} \url{https://perma.cc/WHN7-GB9D}} as a second, dichotomous, measure of development. Finally, the EventLogging instrumentation retains a session token with which we measure whether or not a given page view is the \emph{last-in-session.} We also statistically adjust for the number of pages viewed in the session so far (\emph{Nth in session}).  
+
+\subsubsection{Models}
+We test the three hypotheses using two regression models that differ only in how they represent economic development.  \emph{Model 1a} uses the human development index (HDI) and \emph{model 1b} uses the Global North / Global South regional classification.  Here is the specification of \emph{model 1a}: 
+
+\begin{small}
+
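+\begin{align*}
+Y ={}& B_{0} + B_{1}\mathit{HDI} + B_{2}\mathit{Mobile} + B_{3}\mathit{Mobile} \times \mathit{HDI} \\
+&{} + B_{4}\mathit{RevisionLength} + B_{5}\mathit{DayOfWeek} + B_{6}\mathit{Month} \\
+&{} + B_{7}\mathit{NthInSession} + B_{8}\mathit{LastInSession} \\
+&{} + B_{9}\mathit{HDI} \times \mathit{LastInSession} + B_{10}\mathit{Mobile} \times \mathit{LastInSession} \\
+&{} + B_{11}\mathit{FirstPaint} + B_{12}\mathit{DomInteractiveTime}
+\end{align*}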
+
+% \textbf{Model 1b:} 
+% \[Y=B_{0}+B_{1}GlobalNorth+B_{2}Mobile+B_{3}Mobile:GlobalNorth\]
+% \[B_{4}RevisionLength+B_{5}DayOfWeek+B_{6}Month\]
+% \[+ B_{7}NthInSession+B_{8}LastInSession\]
+% \[+B_{9}GlobalNorth:LastInSession+B_{10}Mobile:LastInSession\]
+% \[+B_{11}FirstPaint+B_{12}DomInteractiveTime\]
+\end{small}
+
+The formula for \emph{model 1b} is the same except for using \emph{GlobalNorth} terms instead of \emph{HDI}.
+
+We consider \textbf{H1} supported if {${\displaystyle B_{1}<0}$} in both models;  \textbf{H2} if {${\displaystyle B_{3}>0}$}; and  \textbf{H3} if {${\displaystyle B_{9}<0}$}.  Because interaction terms can be difficult to interpret qualitatively, we will present marginal effect (ME) plots to assist in qualitative interpretation of the observed relationships \citep{pepinsky_visual_2018}.
+
+We explored alternative model specifications that include higher-order terms and additional interaction terms. We chose to present \emph{model 1a} and \emph{model 1b} because more complex models neither substantively improve the explained variance or predictive performance nor lead to qualitatively different conclusions.  We fit both models using weighted ordinary least squares estimation in R on a stratified sample of size 9,873,641. 
+
+
+\subsection{Non-parametric Analysis}
+
+Our multivariate regression analysis assumes a parametric model and, as we saw in the univariate analysis above, the assumption of log-normality may not be valid for every wiki. Therefore, we also provide a simple non-parametric analysis based on median reading times.  Unlike the regression analysis, the non-parametric analysis does not include statistical controls or afford statistical hypothesis tests, but it avoids having to depend on assumptions about the distribution. We construct a $2 \times 2 \times 2$ table of users depending on whether they are in the Global North or Global South, whether they are on a mobile or desktop device, and whether or not they are on the last page view in their session. The medians of each cell of the table validate that our findings are not driven by the normality assumption alone.
+
+
+
+\section{Results}
+
+%\subsection{Regression Analysis}
+
+We use marginal effects (ME) plots to interpret our regression models.\footnote{Full regression tables are available in the appendix.}  A marginal effects plot shows how the model's predicted outcome varies with respect to one or more of the predictors when other terms of the model are held constant at some typical value \citep{pepinsky_visual_2018}. Since we are interested in comparing reading times between last-in-session page views and other page views, we create two marginal effects plots for each model: one for last-in-session page views and one for non-last-in-session page views. Similarly, we also break down predicted reading times by device type. 
+
+For each marginal effects plot, the y-axis shows the model predicted values and the x-axis shows the values of the predictor variables. In the marginal effects plots shown here, uncertainty intervals represent confidence intervals of the parameter estimates, not uncertainty about the model predictions.  Uncertainty about model predictions in this case is generally very high, as our models explain only a small fraction (about 7\%) of the variance in reading times. 
+
+% start out just considering mobile devices, non-last-in-session
+\subsection{Hypothesis 1: Global context and reading times}
+
+We find support for  \textbf{H1}: that readers in higher-HDI countries ({${\displaystyle \mathrm {B} =-0.20,~SE=0.002}$}) or in the Global North ({${\displaystyle \mathrm {B} =-0.27,~SE=0.002}$}) are likely to spend less time on each page than readers in lower HDI countries or in the Global South. For illustration, our ME plot for \emph{model 1a} (\hyperref[fig:model1aplot]{figure \ref*{fig:model1aplot}}) shows that, for non-last-in-session page views, a prototypical reader on a desktop device in a country with an HDI one standard deviation below the mean is predicted to spend about 25 seconds on a given non-last-in-session page view compared to the predicted 18 seconds spent by an average reader in a country with an HDI one standard deviation above the mean.  Similarly, per our ME plot for \emph{model 1b} (\hyperref[fig:model1bplot]{figure \ref*{fig:model1bplot}}), for last-in-session page views on desktop devices, a prototypical Global North reader is predicted to spend around 42 seconds per page view compared to the 50 seconds spent by a prototypical Global South reader. 
+
+%Similarly, model 1b predicts that a user in Global South country will spend 130\% as much time reading a page as an equivalent reader in a Global North country. 
+
+% \begin{figure}
+% \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/11.png}
+% \caption{ Marginal effects plot showing how the time spent on pages depends on the development level of the country they are in.}
+% \end{figure}
+
+% \begin{figure}
+% \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/12.png}
+% \caption{Marginal effects plot showing how the time spent on pages depends on the development level of the country they are in.}
+% \end{figure}
+
+
+% \begin{figure}
+% \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/16.png}
+% \caption{Marginal effects plot showing how the time spent on pages depends on whether a reader is on whether they are on their last page view in a session, and the development level of the country they are in.}
+% \end{figure}
+\begin{figure}[t]
+\centering
+\begin{knitrout}
+\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}
+\includegraphics[width=\columnwidth]{figures/model1aplot-1} 
+\end{knitrout}
+\caption{\label{fig:model1aplot} Marginal effects plot showing the relationship between HDI and reading time predicted by \emph{model 1a}. The negative slope of the lines shows that lower-HDI readers have longer reading times, and the difference in slopes between devices shows that the relationship between HDI and reading time is more pronounced on desktop devices. The ribbons reflect 95\% confidence intervals of the model coefficients. The x-axis units represent standard deviations from the mean HDI.}
+\end{figure}
+
+
+\subsection{Hypothesis 2: Global context and mobile devices}
+
+%We proposed \textbf{H2:} that in the device gap between mobile and desktop devices such that desktop devices are superior tools for gaining an in-depth understanding that the reading time gap between lower-HDI and greater-HDI readers would be greater in magnitude on desktop devices. 
+
+% NOTE(review): B = 15 for the Global North interaction (with SE = 0.002) looks like a dropped decimal point; confirm against the regression table in the appendix.
+We also find support for \textbf{H2}: that readers in the Global North ({${\displaystyle \mathrm {B} =15,~SE=0.002}$}) or higher-HDI ({${\displaystyle \mathrm {B} =0.11,~SE=0.002}$}) countries are likely to spend even less time reading compared to Global South or lower-HDI readers when they are on a desktop device compared to a mobile device. This is clearly visible as a difference in slopes in \hyperref[fig:model1aplot]{figure \ref*{fig:model1aplot}}. Indeed, for page views other than the last-in-session, the predicted reading time for prototypical readers in countries 1 standard deviation below the mean decreases from 25 seconds on desktop devices to 22 seconds on mobile devices, but the reverse is true for readers in higher-HDI countries. In a country 1 standard deviation above the mean, an otherwise comparable reader is predicted to read for about 19 seconds on mobile and about 17 seconds on desktop.  The ME plot for \emph{model 1b} (\hyperref[fig:model1bplot]{figure \ref*{fig:model1bplot}}) shows that for the prototypical reader, the gap between Global South and Global North is greater on desktop devices (about 5 seconds) than on mobile devices (about 3 seconds).  
+
+%that geometric means of reading times on mobile devices and desktop devices are 22 and 26 seconds respectively, a difference of 4 seconds. However, this moderate gap grows to midni
+
+
+% \begin{figure}
+% \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/13.png}
+% \caption{Marginal effects plot from Model 1 showing how the time spent on pages varies with type of device and on whether a page view is the last in a session.}
+% \end{figure}
+
+%Much of this variation seems to be driven by the least developed countries where people read Wikipedia.  In model 1b, where we label countries as Global South or Global North, readers in Global South countries appear to use mobile and Desktop devices in the same way on average, but readers in Global North countries spend more time reading on Mobile than on Desktop.
+
+
+% \begin{figure}
+%     \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/14.png}
+%     \caption{Marginal effects plot showing how the time spent on pages depends on whether a reader is on the kind of device they are using, and the development level of the country they are in.}
+% \end{figure}
+
+% \begin{figure}
+% \includegraphics[width=1.0\linewidth,height=6.5in,keepaspectratio]{../images/15.png}
+% \caption{Marginal effects plot showing how the time spent on pages depends on the kind of device they are using, and the development level of the country they are in.} \end{figure}
+
+\subsection{Hypothesis 3: Global context and last-in-session}
+
+Based on the ``screen-and-glean'' results by Liu et al., we expected in-depth reading to be most likely in the last page view in a session, and thus predicted \textbf{H3:} the difference in reading times between lower-HDI countries and higher-HDI countries will be amplified in the last page view in a session.  However, we do not find support for this hypothesis, which would have been indicated by a negative regression coefficient for the interaction term between development and last-in-session. Instead, we find positive coefficients for \emph{HDI:Last in session} ({${\displaystyle \mathrm {B} =0.06,~SE=0.002}$}) in \emph{model 1a} and for \emph{Global North:Last in session} ({${\displaystyle \mathrm {B} =0.08,~SE=0.002}$}) in \emph{model 1b}.
+
+%Note that there are still sizable reading time gaps between Global South and Global North readers and between devices when we look at last-in-session page views. These gaps appear larger in the marginal effects plots only because reading times are longer on average, and our model's logarithmic scale magnifies these differences, not because of the interaction term.  
+
+% for the R&R it would be really cool to make a better version of this that shows how the medians as a tree-flow diagram. 
+\subsection{Non-parametric Analysis}
+
+\hyperref[tab:nonparametric]{Table \ref*{tab:nonparametric}} shows the median time pages are visible by the user's economic region, device and whether a page is the last viewed in the user's session. Consistent with  \textbf{H1}, median users in the Global South spend more time on pages compared to median users in the Global North regardless of device or session stage. Consistent with  \textbf{H2}, the difference between Global South and Global North users is clearly more pronounced on desktop compared to mobile.  In contrast to the prediction of \textbf{H3}, but in line with the findings from our parametric analysis, we do not observe an accentuation of the difference between Global South and Global North users in the last page view in a session.
+
+\begin{small}
+\begin{table}[b]
+\centering
+\begin{tabular}[]{@{}llll@{}}
+\toprule
+Economic-region & Desktop & Last-in-session  & Time-visible\\
+\midrule
+ North &  False  & False & 20.1 \\
+ South &  False & False & 21.5 \\
+ North & True  &  False & 16.1 \\
+ South & True &   False & 21.8 \\
+ North & False & True & 28.1 \\
+ South & False & True & 28.7 \\
+ North & True & True & 39.8 \\
+ South & True & True & 43.6 \\
+\bottomrule
+\end{tabular}
+\caption{Table of median reading times by last-in-session, economic region, and device type. Reading times in the Global South are greater than in the Global North in all categories, and are markedly greater on desktop compared to mobile devices. \label{tab:nonparametric}}
+\end{table}
+\end{small}
+\subsection{Page length}
+
+In addition to the above results on reading times and global contexts, we also examined how reading times relate to page length. The association between page length and reading times is small and positive ({${\displaystyle \mathrm {B} =0.18,~SE=0.0004}$}).
+%as shown by the marginal effects plot in figure \hyperref{fig:pagelenplot}.
+Pages on Wikipedia vary greatly in  length: from just a few bytes up to 2,000,000 bytes.
+%Our model estimates that the difference between the shortest and the longest page lengths can account for a difference in typical reading times from about 5 seconds to about 45 seconds. 
+If a page were to double its length, our model would predict a marginal increase in reading times of a factor of 1.2. For example, a page with 10000 bytes has a predicted reading time of 25 seconds, which for a page with twice that length (20000 bytes) increases to 30 seconds.\footnote{See our online supplement at \url{https://w.wiki/5Jo} for a marginal effects plot. Page length refers to the size of the wikitext source of the page measured in bytes. Not every byte corresponds to a character of readable text. Wikitext source also includes code for formatting, using templates, or embedding  images. Additionally, some characters, especially in non-Latin alphabets, may take up multiple bytes. Still our results confirm that for longer Wikipedia articles, only a fraction of the text is read in a typical page view. Assuming a reading speed of around 250 words per minute and an average word length of 5 characters in English (not including spaces and punctuation), these 30 seconds would only suffice to read through less than 1000 of these 20000 bytes \citep{bell_extensive_2001, bochkarev_average_2012}.}
+
+\section{Limitations}
+Two important technical limitations of our dwell time data affect our ability to compare reader behavior between mobile phone and PC devices. The first is missing data on mobile devices, discussed above. This missing data likely introduces a negative bias to our measures of reading time on mobile devices because we believe observations are more likely to be lost when users switch tasks from the browser, and subsequently return to reading. This bias may be quite significant as the issue affects a large proportion of our sample. 
+
+%Therefore, we do not believe that user behaviors that may generate the appearance of long reading times do not correspond to reading.
+%An additional limitation arises from the missing data described above. It is possible that we are missing data in ways that may potentially confound our results, especially, but not exclusively, in terms of the comparison between mobile and non-mobile devices. 
+
+%(the  lunch break problem ). 
+The second limitation occurs when readers leave a page visible in the browser at times when they are not directly reading it. For example, a user may have multiple windows visible while only looking at one of them, or may leave a browser window visible and move away from the computer for a long period of time. In general, the best we can hope to observe is that a page is visible in a browser. We cannot, through this instrument alone, know with confidence that an individual is reading.  This limitation leads to positive bias in our measures of reading time. To partially address this limitation, we fit regression models on data with dwell times greater than 1 hour removed (assuming that it contains a higher ratio of those ``visible but not reading'' cases), and found that our results were not substantively affected by the change.
+
+
+%We think that, especially combined with the first limitation described above (mobile devices failing to log a "page unloaded" event), this issue is likely to affect the data from desktop devices more often, thus limiting our ability to compare mobile phone and PC devices. 
+
+It is possible that this positive bias may correlate with our analytic variables. Perhaps last-in-session views may be particularly subject to this source of bias and may contribute to the gap we observe between reading times in last-in-session page views compared to others. We designed our analysis of H1 and H2 to account for differences between last-in-session and other page views, and found that the sign of the observed differences remained the same whether the view was the last in a session or not. We did not find support for H3, which considered differences within last-in-session page views.
+
+%Conclusions based on H3 would be more likely to be affected by this possible issue, but the data did not support H3 anyway. 
+
+%aIn our patametric analysis  by interacting \textit{last-in-session} with our analytic variables for these hypotheses (\textit{mobile}, \textit{HDI}, and \textit{GlobalNorth}). 
+%We don't see an a priori reason why such a reading pause would occur more often at specific points of a reading session. (Note that because of the way our instrumentation works, the user would normally need to become active again for the last view of a session to conclude and register a dwell time.) However, we can't completely rule out this possibility either based on our data alone.
+
+Additional steps could be taken to construct new measures of reading that would not suffer this limitation through browser instrumentation to track mouse movements or scroll positions. However, such steps should be taken with care as additional data collection may negatively affect users in terms of privacy, browser responsiveness, page load times, and power consumption.
+
+Finally, readers should keep in mind that we analyzed observational, not experimental, data with the intention to describe correlations between our variables, not to demonstrate causal relationships. We used ordinary least squares analysis, but future analysis might better account for the hierarchical structure of our data using multilevel modeling.  
+
+\subsection{Alternative explanations}
+
+% could be good to cite something else on knowledge gap there
+Furthermore, there are several plausible alternative explanations that we cannot rule out in the presented analysis.  The observed reading time gap between more and less developed countries may be due to factors other than the types of information seeking tasks in which readers are engaged. For instance, if readers experience knowledge gaps in less developed countries, they may be likely to read in languages that are not their primary language, and thus spend more time reading regardless of task \citep{graham_uneven_2014}.  A future iteration of this project may partially address this limitation by accounting for whether a Wikipedia edition is a common primary language in the reader's country.  
+
+Another alternative explanation may be that the gap between readers in more and less developed countries is partly due to time spent on exploration (``screening'') rather than on content consumption (``gleaning''). Our finding rejecting \textbf{H3}, 
+%that the last page view in a session is not associated with an amplified gap between Global South and Global North readers 
+suggests this, as Global South readers have longer dwell times on non-last-in-session page views compared to Global North readers. 
+We also observe shorter non-last-in-session page views on desktop devices compared to mobile for Global North readers, but for Global South readers such page views are about the same length no matter what device is used. This unexpected result would be consistent with a skills gap experienced by Global South readers who may have greater difficulty finding sought information, especially when using desktop devices \citep{deursen_toward_2015}. The present analysis offers only tentative support for this claim,  but we suggest it as an avenue for future research.
+
+% propagate the changes in the below paragraph to the report
+Global South readers may also be more sensitive to the price of downloading data and thus they may avoid opening pages that they are unlikely to read in-depth. Future work might use data from the Wikipedia Zero project to study the relationship between price sensitivity and Wikipedia audiences. More generally, drawing conclusions about information seeking from our analysis rests on strong assumptions about relationships between task type and reading times. Future work on information seeking behavior on Wikipedia testing these assumptions would help validate such conclusions. 
+
+
+% \begin{figure}
+% \centering
+% <<pagelen, echo=F,message=F,warning=F,fig.width=5,fig.height=3,dev='pdf',out.width='\\columnwidth' >>=
+% p <- ggplot(pagelen.plot.data,aes(x=x,y=predicted,ymax=conf.high,ymin=conf.low))
+% p <- p + geom_ribbon(alpha=0.3) + geom_line()
+% p <- p + ylab("Time visible (seconds)")
+% p <- p + xlab("Page length (log bytes)")
+% p <- p + theme(legend.position = "none")
+% p
+% @
+% \caption{Marginal effects plot showing how the time spent on pages depends on page length according to Model 1a. \label{fig:pagelenplot}}
+% \end{figure}
+
+
+\section{Discussion and Conclusion}
+
+%\subsection{}
+
+In an analysis of novel data from Wikipedia, measuring the time that web pages are visible in the browser window as an approximation of reading time, we investigated patterns of reader behavior across global contexts and found systematic differences consistent with greater use for in-depth understanding in lower-HDI countries compared to higher-HDI countries. We believe this analysis should strengthen confidence in similar findings from surveys of reader behavior because our data have complementary strengths and limitations compared to self-report data.  
+
+We conclude that Global South readers are more likely to engage in in-depth information seeking when reading Wikipedia compared to Global North readers. Consistent with Lemmerich et al.'s survey results \citep{lemmerich_why_2019}, we find that readers in lower-HDI countries have longer reading times than readers in higher-HDI countries, and that this difference is greater for users of non-mobile (desktop) devices.  
+
+The observed relationships are quite similar whether measured using the human development index (HDI) or dichotomized economic region (Global South / Global North). These relationships are supported not only by the regression models, but also by non-parametric analysis. 
+While Wikipedia readers increasingly use mobile devices to visit Wikipedia, they are likely to spend the most time reading when they are in the last page view of a desktop session. This is exactly when we expect them to gain in-depth understandings of topics.
+
+We lack evidence to fully explain our findings in terms of structural and socioeconomic differences between the Global North and Global South.  One possibility is that the gap in reading times reflects differences in information seeking and content understanding skills \citep{deursen_toward_2015, shaw_pipeline_2018}. That we did not observe the gap between global contexts widen in last-in-session page views tentatively suggests that Global South readers are more likely to struggle to find and filter information on Wikipedia compared to Global North readers. 
+
+However, given the evidence that Wikipedia readers in the Global South are more likely to engage in deeper information seeking tasks \citep{lemmerich_why_2019}, we conjecture that the gap in reading times may be explained by the quality and accessibility of the information on Wikipedia relative to alternatives available in the reader's contexts. Wikipedia may not be perfect, but given historical inequalities in education and knowledge production between the Global South and Global North \citep{graham_uneven_2014}, it still might be competitive compared to other sources, especially when it comes to encyclopedic content about the Global South, content in local languages, and information not otherwise available for free to Internet users.  This would explain why Global South readers would be more likely to choose Wikipedia when seeking in-depth information. Future research might test this hypothesis in audience surveys or by adapting approaches previously applied to gender comparisons on English Wikipedia \citep{reagle_gender_2011}.
+
+
+%We considered While we However, we did not find our predicted relationship between reading behavior in the last page view in a session and the association between  
+
+%device use is associated with an amplified gap between Global South and Global North readers supports the idea that
+
+%All of these results are consistent with the proposition that readers in the Global South are more likely to engage in deep information seeking tasks compared to readers in the Global North. 
+
+%Our analysis reading time is generally consistent with findings from the survey study, which suggested that readers in Global South countries are most likely to engage in more intensive information seeking tasks. 
+
+%We also considered whether the relationship would be amplified in the last view in a browser session, which we expect to be associated with content consumption as opposed to discovery.  While we do observe that all readers dwell for longer in the last page view in a session, and that readers in developing countries appear to read longer, we do not observe the gap between readers in less developed and more developed countries amplified in the last view in the session. 
+
+Another contribution of this study is to vet the reading time data to understand its limitations and to conduct model selection to justify parametric assumptions for future analysts. We found a high rate of missing data on mobile, among other less significant irregularities. Future analysts should keep this in mind and work to improve the coverage. We found that the log-normal distribution often fits the data well, and therefore adopted the use of geometric means as a metric for comparing samples of reading times. This also helped support our decision to adopt ordinary least squares regression analysis for multivariate comparison. However, we also found that exponentiated Weibull and Lomax probability models were often an even better fit.  Future researchers might explore how reader behavior may generate data in processes consistent with these models.
+
+%To further complement the approaches to studying Wikipedia audiences described here, we suggest that future researchers conduct field studies of Wikipedia readers. Direct, in-person observation of Wikipedia readers can help us understand how people use Wikipedia in much greater detail than surveys and browser instrumentation can allow.  
+
+The reading time data we used in this study is a promising tool for future researchers to improve upon studies of page views for understanding Wikipedia's audiences.  For example, recent research has shown widespread misalignment between how often articles are visited and the quality of those articles \citep{warncke-wang_misalignment_2015}.  However, we have observed that not all views are created equal. Future studies on the relationship between content production and content consumption on Wikipedia might use reading time data to learn about how content consumption might change depending on article quality. 
+
+
+%One anticipated application of reading time data is for evaluating design interventions intended to improve the user experience of Wikipedia visitors. We recommend that analysts and designers use geometric means as a metric for comparing reading behavior between treatments or between sites. The distribution of reading times is very skewed and therefore the arithmetic mean can be misleading. Moreover, for most wikis, the  Log-normal distribution is a good fit to the data, and this justifies the use of geometric means. 
+
+\section*{Acknowledgements}
+We are grateful to the anonymous reviewers, whose observations helped improve the paper. Special thanks to the web team at the Wikimedia Foundation that built the instrumentation, to Zareen Farooqui who conducted initial data quality vetting as part of her data analyst Outreachy internship at WMF, and to the Foundation's Analytics Engineering team for supporting the data analysis infrastructure used in this work.  Thanks to those who provided comments on various stages of this research, including Kaylea Champion, other members of the Community Data Science Collective, Johnathan Morgan, Aaron Halfaker, Isaac Johnson, Miriam Redi, Abbey Ripstra, and other members of the Wikimedia research team. Special thanks to Benjamin Mako Hill for his comments and advice. This work was completed while Nathan TeBlunthuis was a PhD student at the University of Washington and in his capacity as a Wikimedia Foundation contractor and affiliate. It was also supported by the National Science Foundation (GRFP-2016220885).
+
+%\balance{}
+
+\setcounter{biburlnumpenalty}{9001}
+\printbibliography[title = {References}, heading=secbib]
+
+\clearpage
+\small
+\begin{table*}[htbp]
+\centering
+%\addto\captionsenglish{\renewcommand{\figurename}{Appendix:}}
+
+\caption{Regression tables for models 1a and 1b.} 
+\begin{tabular}{l c c }
+\hline
+ & Model 1a & Model 1b \\
+\hline
+Intercept                      & $1.3660 \; (0.0085)^{***}$  & $1.3791 \; (0.0085)^{***}$  \\
+Global North                   &                             & $-0.2680 \; (0.0022)^{***}$ \\
+mobile : Global North          &                             & $0.1490 \; (0.0024)^{***}$  \\
+mobile : Last in Session       & $-0.6332 \; (0.0021)^{***}$ & $-0.6349 \; (0.0021)^{***}$ \\
+Global North : Last in Session &                             & $0.0830 \; (0.0024)^{***}$  \\
+Human development index        & $-0.1961 \; (0.0018)^{***}$ &                             \\
+mobile : HDI                   & $0.1133 \; (0.0019)^{***}$  &                             \\
+HDI : Last in Session          & $0.0632 \; (0.0019)^{***}$  &                             \\
+Revision length (bytes)        & $0.1752 \; (0.0004)^{***}$  & $0.1758 \; (0.0004)^{***}$  \\
+time to first paint            & $-0.0164 \; (0.0006)^{***}$ & $-0.0171 \; (0.0006)^{***}$ \\
+time to dom interactive        & $0.0025 \; (0.0009)^{**}$   & $0.0024 \; (0.0009)^{**}$   \\
+mobilemobile                   & $-0.0118 \; (0.0023)^{***}$ & $-0.0142 \; (0.0023)^{***}$ \\
+sessionlength                  & $-0.0001 \; (0.0000)^{***}$ & $-0.0001 \; (0.0000)^{***}$ \\
+Last in session   & $0.8632 \; (0.0023)^{***}$  & $0.8575 \; (0.0023)^{***}$  \\
+nthinsession                   & $0.0002 \; (0.0000)^{***}$  & $0.0002 \; (0.0000)^{***}$  \\
+dayofweekMon                   & $0.0939 \; (0.0020)^{***}$  & $0.0926 \; (0.0020)^{***}$  \\
+dayofweekSat                   & $0.0169 \; (0.0020)^{***}$  & $0.0175 \; (0.0020)^{***}$  \\
+dayofweekSun                   & $0.0322 \; (0.0020)^{***}$  & $0.0332 \; (0.0020)^{***}$  \\
+dayofweekThu                   & $0.0561 \; (0.0019)^{***}$  & $0.0548 \; (0.0019)^{***}$  \\
+dayofweekTue                   & $0.0349 \; (0.0020)^{***}$  & $0.0326 \; (0.0020)^{***}$  \\
+dayofweekWed                   & $0.0757 \; (0.0019)^{***}$  & $0.0743 \; (0.0019)^{***}$  \\
+usermonth4                     & $0.0095 \; (0.0096)$        & $0.0083 \; (0.0096)$        \\
+usermonth5                     & $0.0108 \; (0.0095)$        & $0.0104 \; (0.0095)$        \\
+usermonth6                     & $-0.0102 \; (0.0097)$       & $-0.0103 \; (0.0097)$       \\
+usermonth7                     & $-0.0494 \; (0.0097)^{***}$ & $-0.0491 \; (0.0097)^{***}$ \\
+usermonth8                     & $-0.0119 \; (0.0097)$       & $-0.0121 \; (0.0097)$       \\
+usermonth9                     & $0.0382 \; (0.0076)^{***}$  & $0.0370 \; (0.0076)^{***}$  \\
+usermonth10                    & $-0.0004 \; (0.0075)$       & $0.0010 \; (0.0075)$        \\
+\hline
+R$^2$                          & 0.0721                      & 0.0725                      \\
+Adj. R$^2$                     & 0.0720                      & 0.0725                      \\
+Num. obs.                      & 9873641                     & 9873641                     \\
+RMSE                           & 14.2330                     & 14.2297                     \\
+\hline
+\multicolumn{3}{l}{\scriptsize{$^{***}p<0.001$, $^{**}p<0.01$, $^*p<0.05$}}
+\end{tabular}
+\end{table*}
--- a/dissertations/nathante_uw_2021/appendix_C_oresfairness.tex
+++ b/dissertations/nathante_uw_2021/appendix_C_oresfairness.tex
--- a/dissertations/nathante_uw_2021/articlequality.bib
+++ b/dissertations/nathante_uw_2021/articlequality.bib
--- a/dissertations/nathante_uw_2021/cdsc-memoir.sty
+++ b/dissertations/nathante_uw_2021/cdsc-memoir.sty
@ -0,0 +1,209 @@
+% Some article styles and page layout tweaks for the LaTeX Memoir
+% class.
+%
+% Copyright 2009-2018 Benjamin Mako Hill <mako@atdot.cc>
+% Copyright 2008-2009 Kieran Healy <kjhealy@soc.duke.edu>
+
+% Distributed as free software under the GNU GPL v3
+
+% This file was originally based on one by Kieran Healy
+% available here: http://github.com/kjhealy/latex-custom-kjh/
+
+%%% Custom styles for headers and footers
+%%% Basic 
+
+% Default page style for the cdsc chapter styles: no footer is defined and
+% the page number is set in small caps in the right-hand header slot on both
+% even and odd pages.
+\makepagestyle{cdsc-page}
+%\makeevenfoot{cdsc-page}{\thepage}{}{}
+%\makeoddfoot{cdsc-page}{}{}{\thepage}
+%\makeheadrule{cdsc-page}{\textwidth}{\normalrulethickness}
+% \@cdscmarks redefines the mark commands:
+%  - \chaptermark sets the left mark to the chapter title, prefixed by
+%    "<chapter number>. " when chapters are numbered (secnumdepth > -1) and
+%    the document is in main matter; the right mark is cleared.
+%  - \sectionmark sets the right mark to the section title.
+\newcommand{\@cdscmarks}{%
+  \let\@mkboth\markboth
+  \def\chaptermark##1{%
+    \markboth{%
+      \ifnum \c@secnumdepth >\m@ne
+        \if@mainmatter
+          \thechapter. \ %
+        \fi
+      \fi
+      ##1}{}}
+  \def\sectionmark##1{%
+    \markright{##1}}
+}
+% NOTE(review): the second \makepsmarks call below targets the same style with
+% an empty argument and presumably replaces the first, leaving \@cdscmarks
+% uninstalled for cdsc-page. The headers below do not print any marks, so the
+% output looks the same either way -- confirm which call is intended.
+\makepsmarks{cdsc-page}{\@cdscmarks}
+\makepsmarks{cdsc-page}{}
+\makeevenhead{cdsc-page}{}{}{\scshape\thepage}
+\makeoddhead{cdsc-page}{}{}{\scshape\thepage}
+
+%%% Page style with version-control info in the footer; requires the vc package
+% Headers show the page number in small caps in the right-hand slot; the
+% middle footer slot shows the git revision and date (\VCRevision, \VCDateTEX)
+% in small BrickRed typewriter type.
+\makepagestyle{cdsc-page-git}
+\newcommand{\@gitmarks}{%
+  \let\@mkboth\markboth
+  % chapter mark: "<number>. " (numbered main-matter chapters only) + title
+  \def\chaptermark##1{%
+    \markboth{%
+      \ifnum\c@secnumdepth>\m@ne \if@mainmatter \thechapter. \ \fi \fi
+      ##1}{}}
+  % section mark: the section title becomes the right mark
+  \def\sectionmark##1{\markright{##1}}
+}
+\makepsmarks{cdsc-page-git}{\@gitmarks}
+\makeoddhead{cdsc-page-git}{}{}{\scshape\thepage}
+\makeevenhead{cdsc-page-git}{}{}{\scshape\thepage}
+\makeoddfoot{cdsc-page-git}{}{\texttt{\footnotesize\textcolor{BrickRed}{git revision \VCRevision\ on \VCDateTEX}}}{}
+\makeevenfoot{cdsc-page-git}{}{\texttt{\footnotesize\textcolor{BrickRed}{git revision \VCRevision\ on \VCDateTEX}}}{}
+
+%%% Page style that prints a build datestamp (written for ShareLaTeX)
+% Headers show the page number in small caps in the right-hand slot; the
+% middle footer slot shows "Buildstamp/Version: <\pdfdate>" in small BrickRed
+% typewriter type.
+% NOTE(review): \pdfdate is presumably supplied by the datetime package loaded
+% further down in this file -- confirm.
+\makepagestyle{cdsc-page-overleaf}
+\newcommand{\@slmarks}{%
+  \let\@mkboth\markboth
+  % chapter mark: "<number>. " (numbered main-matter chapters only) + title
+  \def\chaptermark##1{%
+    \markboth{%
+      \ifnum\c@secnumdepth>\m@ne \if@mainmatter \thechapter. \ \fi \fi
+      ##1}{}}
+  % section mark: the section title becomes the right mark
+  \def\sectionmark##1{\markright{##1}}
+}
+\makepsmarks{cdsc-page-overleaf}{\@slmarks}
+\makeoddhead{cdsc-page-overleaf}{}{}{\scshape\thepage}
+\makeevenhead{cdsc-page-overleaf}{}{}{\scshape\thepage}
+\makeoddfoot{cdsc-page-overleaf}{}{\texttt{\footnotesize\textcolor{BrickRed}{Buildstamp/Version:~\pdfdate}}}{}
+\makeevenfoot{cdsc-page-overleaf}{}{\texttt{\footnotesize\textcolor{BrickRed}{Buildstamp/Version:~\pdfdate}}}{}
+
+%% \published{<text>} records a note describing the publication status of the
+%% paper; the note is printed at the top of the first page via the
+%% \maketitlehooka hook.
+\newcommand{\puB}{}% storage for the note; stays empty unless \published is used
+\newcommand{\published}[1]{\gdef\puB{#1}}
+% Emit the stored note as an unindented \footnotesize paragraph.
+\renewcommand{\maketitlehooka}{\par\noindent\footnotesize \puB}
+
+% Memo page style: empty headers; the middle footer slot shows
+% "<current page>/<last page>" in small caps.
+\makepagestyle{cdsc-page-memo}
+\makeevenhead{cdsc-page-memo}{}{}{}
+\makeoddhead{cdsc-page-memo}{}{}{}
+\makeevenfoot{cdsc-page-memo}{}{\scshape \thepage/\pageref{LastPage}}{}
+\makeoddfoot{cdsc-page-memo}{}{\scshape \thepage/\pageref{LastPage}}{}
+
+% Package dependencies. Inside a package file these are loaded with
+% \RequirePackage, the package-writer counterpart of \usepackage.
+% lastpage provides the LastPage label referenced in the footers above.
+\RequirePackage{lastpage}
+\RequirePackage{datetime}
+
+% \symbolfootnote[<n>]{<text>}: footnote marked with a symbol, not a number.
+% Inside a local group, \thefootnote is switched to \fnsymbol before the
+% ordinary \footnote[<n>]{<text>} is issued, so the change does not leak out.
+% The bracketed number is mandatory (it is a delimited argument of the \def).
+% Use \symbolfootnote[0]{Footnote text} for a blank (unmarked) footnote.
+% Useful for an initial acknowledgment note.
+\long\def\symbolfootnote[#1]#2{\begingroup%
+\def\thefootnote{\fnsymbol{footnote}}\footnote[#1]{#2}\endgroup}
+
+% Section numbers are followed by a period and an en-space.
+\setsecnumformat{\csname the#1\endcsname.\enspace}
+
+% Font families: Garamond (ugm) for roman text, Helvetica (phv) for sans serif.
+\renewcommand{\rmdefault}{ugm}
+\renewcommand{\sfdefault}{phv}
+
+% ---- settings shared by the cdsc-article and cdsc-memo chapter styles ----
+
+% Subsection and subsubsection headings.
+\setsubsecheadstyle{\normalsize\itshape}
+\setsubsubsecheadstyle{\small\bfseries}
+\setaftersubsubsecskip{-1em}
+
+% Chapter headings: the chapter-name slot only switches to the section-head
+% style, the number is set in the chapter title font followed by a space, and
+% nothing is inserted between name and number or after the number.
+\renewcommand{\printchaptername}{\secheadstyle}
+\renewcommand{\chapternamenum}{}
+\renewcommand{\chapnumfont}{\chaptitlefont}
+\renewcommand{\printchapternum}{\chapnumfont \thechapter\space}
+\renewcommand{\afterchapternum}{}
+
+% Table-of-contents chapter entries.
+\renewcommand{\cftchapterfont}{\normalfont}
+\renewcommand{\cftchapterpagefont}{\normalfont\scshape}
+\renewcommand{\cftchapterpresnum}{\scshape}
+
+% Caption text is set in \small.
+\captiontitlefont{\small}
+
+% turn off chapter numbering
+% \counterwithout{section}{chapter}
+% \counterwithout{figure}{chapter}
+% \counterwithout{table}{chapter}
+ 
+% suppress chapter numbers
+% \maxsecnumdepth{chapter} 
+% \setsecnumdepth{chapter}
+
+% for numbered sections and subsections:
+% (a) comment out the above stanza; (b) uncomment the one below
+% \maxsecnumdepth{subsection} 
+% \setsecnumdepth{subsection}
+
+% set name of bibliography to 'references'
+\renewcommand{\bibname}{References}
+
+% >> cdsc-article <<
+% Chapter style for article-like documents: small-caps section heads, a
+% flush-left title block, a run-in abstract, and the cdsc-page page style.
+\makechapterstyle{cdsc-article}{
+
+  % section heading style: large small caps
+  \setsecheadstyle{\large\scshape} 
+ 
+ % reduce skip after section heading
+  \setaftersecskip{1.7ex}
+ 
+  % Title block set flush left: title in \LARGE italics, author in \large,
+  % date in \footnotesize, with explicit vertical skips after each element
+  \pretitle{\flushleft\LARGE \itshape}
+  \posttitle{\par\vskip 0.5em}
+  \preauthor{\flushleft  \large \lineskip 1em}
+  \postauthor{\par\lineskip 1em}
+  \predate{\flushleft\footnotesize\vspace{0.65em}}
+  \postdate{\par\vskip 1em}
+ 
+  % Abstract: bold \small "Abstract:" label run in with \small body text and
+  % no paragraph indentation.
+  % NOTE(review): \abstitleskip is set to a negative length; its exact visual
+  % effect with \abstractrunin should be confirmed against the memoir manual.
+  \renewcommand{\abstractname}{Abstract:}
+  \renewcommand{\abstractnamefont}{\normalfont\small\bfseries}
+  \renewcommand{\abstracttextfont}{\normalfont\small}
+  \setlength{\absparindent}{0em}
+  \setlength{\abstitleskip}{-1.5em}
+  \abstractrunin
+
+  % this is the default page style for chapters
+  \pagestyle{cdsc-page}
+
+}
+
+% >> cdsc-memo <<
+% Chapter style for short memos: bold uppercase sans-serif section heads with
+% tight spacing, a one-line title/author header in place of the standard
+% \maketitle, a run-in abstract without a visible label, no paragraph
+% indentation, and the cdsc-page-memo page style.
+\makechapterstyle{cdsc-memo}{
+
+  % section heading style: large, sans serif, bold, uppercased
+  \setsecheadstyle{\large\sffamily\bfseries\MakeUppercase} 
+
+  % reduce skip after section heading
+  \setaftersecskip{1pt}
+  \setbeforesecskip{-1em}
+  \setaftersubsecskip{1pt}
+  \setbeforesubsecskip{-1em}
+  % \setaftersubsubsecskip{1pt}
+  % \setbeforesubsubsecskip{-1em}
+
+ 
+  % Replace \maketitle: uppercase bold title on the left, uppercase author
+  % pushed to the right by \hfill, followed by a 0.7em vertical skip
+  % \renewcommand{\maketitle}{\{\preauthor \theauthor\} \hfill \thetitle}
+  \renewcommand{\maketitle}{
+    {\Large\sffamily\bfseries\MakeUppercase\thetitle} \hfill
+    {\Large\sffamily\MakeUppercase\theauthor}
+    \vskip 0.7em}
+  % Abstract: the label is replaced by font/indent commands only (no visible
+  % "Abstract" text); the body is set in \scriptsize and run in
+  \renewcommand{\abstractname}{\normalfont\scriptsize\noindent}
+  \renewcommand{\abstracttextfont}{\normalfont\scriptsize}
+  \abstractrunin
+
+  % set name of bibliography to 'references'
+  \renewcommand{\bibname}{References}
+  
+  \parindent 0pt
+
+  % this is the default page style for chapters
+  \pagestyle{cdsc-page-memo}
+
+}
+
+\endinput
+
--- a/dissertations/nathante_uw_2021/ch1_intro.bib
+++ b/dissertations/nathante_uw_2021/ch1_intro.bib
--- a/dissertations/nathante_uw_2021/ch1_intro.tex
+++ b/dissertations/nathante_uw_2021/ch1_intro.tex
@ -0,0 +1,282 @@
+% \maketitle
+
+Would Wikipedia be one of the most visited websites in the world if other online collaborative encyclopedia projects had been more established when it was founded? Or was Wikipedia helped by the fact that its predecessors had engaged and trained hundreds of its future contributors? Do new discussion communities on Reddit compete with existing communities for contributors? Is the evolving world of online communities better understood as a competitive struggle for resources or as symbiotic relationships that support a web of interdependent communities?   
+How does the environment of existing online communities shape the growth, performance, and impact of new groups?
+
+Answering these questions requires an \textit{ecological understanding} of online communities that accounts for the complex dynamic interactions between communities and their environments. 
+Prior studies of the growth, survival, and success of online communities have focused almost exclusively on communities' internal features \citep{kraut_building_2012} and have largely neglected environmental factors \citep[e.g.,][]{halfaker_rise_2013, kraut_building_2012, schweik_internet_2012, shaw_laboratories_2014, teblunthuis_revisiting_2018}. 
+Analyses from this ``focal organization perspective''  \citep{hannan_organizational_1989}    typically account for only a small amount of variation in communities' growth, longevity, and performance. Ecology provides a compelling alternative theoretical approach. In biology and organization studies, ecological approaches have shown that success is largely---and sometimes overwhelmingly---a function of what other groups are doing \citep{hannan_organizational_1989, worster_natures_1994}.
+
+Ecology is a scientific approach to understanding how interdependence between individuals, collectives, and environments shapes the world \citep{worster_natures_1994}.
+%Ecology grew from roots in 18\textsuperscript{th} century naturalism into a science of interrelationships between organisms, between species, and between organisms and the environment \citeh{worster_natures_1994}. 
+Although first developed to understand biological ecosystems, ecology's theories and methods influenced the development of human ecology, and later of organizational ecology \citep{hannan_organizational_1989, mcpherson_ecology_1983, park_human_1936}. Organizational ecology is a vast field in social science that explains the success, failure, and evolution of newspapers, microbreweries, social movements, and voluntary organizations \citep{carroll_concentration_1985, carroll_why_2000, mcpherson_ecology_1983, soule_competition_2008}. 
+Ecology can provide practical solutions to problems in complex systems like effective wildlife management, pest control, and sustainable utilization of renewable resources. In organization science, it provides compelling explanations for industrial life-cycles, organizational specialization, and patterns of collaborative partnerships.
+
+Recent research in social computing on interdependence between online communities suggests that ecological analyses can provide not only novel scientific understandings but also viable community management strategies \citep{chandrasekharan_you_2017, kiene_managing_2018, tan_tracing_2018, teblunthuis_density_2017, wang_impact_2013, vincent_examining_2018, zhu_impact_2014}.  For example, \citet{chandrasekharan_you_2017} found evidence that banning hateful communities on Reddit decreased hate speech in related communities.
+Community outcomes such as growth and survival depend on membership overlaps between communities \citep{wang_impact_2013, zhu_impact_2014}, but the nature of the resulting relationships remains unclear. \citet{wang_impact_2013} found that participant overlaps between Usenet groups were associated with \emph{competition} and decreased participation in both communities. However, \citet{zhu_impact_2014} found evidence that membership overlap between wikis is associated with \emph{mutualism} and benefits for both communities. Such contradictory findings point to the need for deeper, more precise theories of how ecological dynamics play out in online communities. 
+% In this sense, ecological approaches sit at the edge of established knowledge in field of social computing.  
+% That said, bare novelty itself is not a compelling motivation for a research program.
+
+
+Online communities are a dynamic, growing, and increasingly important form of organization that enable collaboration on public goods in contrast to the private goods production most studied in organizational ecology \citep{benkler_peer_2015}. 
+Through peer production, the Wikipedia community has produced the largest collaborative effort and most important reference work in human history.
+Free/libre open source software (FLOSS) communities have produced tens of billions of dollars worth of software made freely available online \citep{benkler_peer_2015}. 
+Other online communities like subreddits provide information, social support, and entertainment to millions of people.
+Ecological research into online communities may enable us to understand \emph{why} and \emph{how}, of the millions of attempts to build communities, only a tiny percentage manage to mobilize participants and to sustain collaboration \citep{schweik_internet_2012, hill_studying_2019, shirky_here_2008}.
+However, online communities are vastly different from the organizations organizational ecology was developed to study.
+Classical hypotheses in organizational ecology are built on a system of interlinked assumptions that were informed by background knowledge of 20\textsuperscript{th} century organizations.  
+I argue that past applications of organizational ecology to online communities have not anticipated how this change in context could lead to changes in theoretical predictions. 
+
+%Much less isi undertsoo
+Therefore, I do not pick up organizational ecology as an authoritative model or set of laws capable of explaining the growth and decline of online communities. 
+Instead I drew some ideas directly from mathematical ecology, a subfield of applied mathematics, to better understand the foundational assumptions of an ecological perspective.
+On this foundation, I see this project as building an empirical basis for an ecological theory of online communities that starts by inferring competitive and mutualistic relationships between online communities. 
+% Studying these relationships is the place to start both methodologically and conceptually. 
+Once ecological dynamics between communities are demonstrated to have measurable relationships with the growth and performance of online communities, we can more fully explain their origins and consequences. 
+
+My empirical studies are framed in terms of how the ecological approach provides new insights and ways of studying interdependent online communities.
+However, these studies' methodological designs and empirical results also contribute to organizational ecology by expanding its application beyond the scope of its founding assumptions.
+The developers of organizational ecology developed strong intuitions about when organizations will complement or compete with one another based on claims from prior organization theory including that organizations compete over resources, are shaped into established forms by homogenizing pressures, are defined by strong boundaries, and lack capacities for rational adaptation.
+Because they typically lacked sufficient longitudinal data to infer when organizations are competitors or mutualists, they have rarely tested these assumptions directly, but rather test theoretical predictions about outcomes like organizational formation, survival, and change \citep{hannan_organizational_1989, baum_ecological_2006}. 
+The literature on interdependence between online communities is relatively young and provides less background knowledge that can inform such assumptions, but data from online communities enables a stronger empirical basis for understanding relationships between groups.
+
+Although the time series models in Chapters 2 and 4 depend on fewer assumptions about when competition or mutualism occur compared to the most influential frameworks of organizational ecology, Chapter 2's key finding, that mutualism is more common than competition among online groups with highly overlapping users, radically departs from organizational ecology which has found that both firms and voluntary organizations with highly overlapping resources typically compete \citep{mcpherson_ecology_1983, hannan_organizational_1989}.  Although I initially planned to continue developing model-based approaches to explaining the performance of online communities, to find widespread mutualism was surprising and demanded qualitative validation and explanation in terms of the experiences of online community members.  Therefore, Chapter 3 reports on an interview-based study of members of highly overlapping online communities. It concludes that ``no community can do everything'' because groups of overlapping communities are characterized by high degrees of specialization. Each community seems to provide a different set of benefits. As Chapter 3 discusses, this is consistent with ecological theory which suggests that highly specialized groups with overlapping memberships are unlikely to compete and that groups provide complementary benefits that can ``spill over'' and drive mutualistic dynamics. 
+
+Knowledge from interviewees includes invaluable cases of mutualism, grounded descriptions of relationships between overlapping online communities, a strong sense that Chapter 2's models are right about the ubiquity of mutualism, and clues about the importance of specialization.    However, the interviewees did not provide much to explain the processes by which systems of specialized mutualistic overlapping communities develop. In Chapter 4, I draw from strands of organizational ecology that use evolutionary theory as a foundation for processes of change.  In addition, the models from Chapter 2 are effectively the simplest time series models that might be used to infer ecological interactions.  They depend on many assumptions that are probably unrealistic in the setting of online communities.  Therefore, Chapter 4 adopts non-linear time series models developed by mathematical ecologists to study nonlinear dynamics.  These models are important to Chapter 4's study design for investigating change processes and also compel us to conceptualize competition and mutualism as interactions that are not static and fixed, but that vary over time. 
+
+% This doesn't quite work yet. Need to develop the insights for organizational ecology.
+In sum, online communities are a kind of organization, at least in the sense that organizations are ``constructed as tools for specific kinds of collective action'' \citep{hannan_organizational_1989}. Even when online communities are constructed to facilitate communication with strangers on the internet about a topic, this facilitation depends on the sustained contributions of members to keep the conversation going and on structures for regulating behavior to maintain a suitable conversation space \citep{kraut_building_2012}. Online communities bear other similarities to organizations including their use of formalized roles, rules, and procedures and their use of boundaries defining the scope of activity \citep{foote_formation_2019}.
+
+That said, online communities are distinctive in that they are public-good producing voluntary groups constructed through computer-mediated communication. 
+Features of online communities depart in important ways from the types of organizations that classical organizational ecology has studied the most.  Online communities (1) are dependent on volunteer participation, (2) allow participation at very low levels of granularity, (3) are weakly bounded, and (4) face different potential sources of inertia.
+The remainder of this chapter discusses the methodological and theoretical implications of these interrelated features for ecological analysis.
+% Some of these features are often said to be made possible by the digital media through which online communities are constructed. 
+% Drawing on ecological theory and findings from the empirical chapters, I will suggest a role for ecological dynamics in the processes that give rise to and stabilize these structures.
+
+%  
+
+% Online communities (1) produce public rather than private goods and (2) online communities are online.  
+\section{Online Communities as Voluntary Organizations}
+
+Ecological theories conceive of dynamics among individuals that share resources needed for production and survival to explain change in the size and composition of groups over time.
+Organizational ecology explains macro-level social change in economies and industries in an ``evolutionary'' style through mechanisms of the selection and adaptation of firms in a changing resource environment \citep{ven_explaining_1995, hannan_organizational_1989}. 
+Each organization's survival depends on its \emph{niche} in the resource environment. The notion of a niche is central, if sometimes slippery, and aims to capture the position of an organization in an abstract, high-dimensional resource space \citep{hannan_organizational_1989}.
+Organizational ecology was developed mainly to study commercial firms whose survival ultimately depends on their potential to offer returns on investment.
+Profitability of these firms typically hinged on expansion to control greater quantities of resources, provide economies of scale and create the potential for monopolistic rents \citep{hannan_organizational_1989}.  
+Niches for such organizations are often defined in terms of established categories of organizational forms \citep{carroll_why_2000}, technological production factors \citep{dobrev_dynamics_2001}, or economic outputs \citep{dobrev_shifting_2003}.
+
+How should we define niches for online communities?  
+They use the low-cost communication systems of the Internet to coordinate voluntary production of public information goods like encyclopedias, FLOSS programs, and cultural artifacts \citep{benkler_wealth_2006}.
+An online community might produce something damaging to the broader society, such as computer viruses or misinformation, but the types of online communities considered here produce public goods defined as \emph{non-excludible} (in principle, an individual cannot be excluded from utilizing them) and \emph{non-rival} (utilization does not diminish the good's value). 
+Therefore, the survival of online communities depends not on capacities to generate revenues and capture profits, but on the consistent participation of volunteer members who have heterogeneous motivations for contributing to a public good \citep{lampe_motivations_2010, shah_motivation_2006}.
+
+Dependence on volunteer members is something online communities have in common with voluntary organizations like social clubs, churches, or fraternal organizations \citep{bimber_collective_2012}. 
+Voluntary organizations have been studied in organizational ecology by J. Miller McPherson and collaborators who investigate overlapping niches defined by organizational members and associated demographic patterns \citep{mcpherson_evolution_1991, popielarz_edge_1995, mcpherson_ecology_1983, mcpherson_testing_1996}. 
+For example, \citet{popielarz_edge_1995} locate voluntary organizations' niches in ``Blau Space''  corresponding to the distribution of their members' demographic characteristics and  explain how voluntary organizations tended to become racially or educationally homogeneous in terms of competitive dynamics over members' time and attention \citep{popielarz_edge_1995}.   Similar to McPherson, ecological studies of online communities, including the present work, have defined niches of online communities in terms of their participants \citep{wang_impact_2012, zhu_impact_2014}. 
+
+However, membership is not the only plausible way to define an online community's niche. As a consequence of their nature as public-good producing voluntary organizations, their survival does not depend on expansion. Although influential models of the growth of online communities have assumed that motivations to participate in online communities increase as communities grow \citep{butler_membership_2001, kraut_building_2012}, recent surveys and interviews find that large and small communities provide different sorts of benefits \citep{hwang_why_2021, foote_starting_2017}. As Chapter 3 finds, larger communities provide steady streams of content and larger potential audiences, but are less capable of providing tight-knit socialization or specialized information.
+
+This kind of size-dependent specialization resembles ``niche-width'' arguments in organizational ecology.  For example, \citet{carroll_concentration_1985}  seeks to explain the coexistence of large and small organizations within an industry by proposing that generalists, who have wide niches, under-perform in certain areas of the resource space.
+Smaller organizations can exploit this underperformance by specializing in these areas.
+However, as \citet{dobrev_dynamics_2001} argue, specialist organizations can grow large in certain circumstances and then organizational size can be uncorrelated with niche width. This is the case with online communities.
+For example, the subreddit \texttt{r/prequelmemes} is dedicated to making and sharing memes only about the Star Wars prequels and is the largest Star Wars related community on Reddit.
+Therefore, it is important to recognize that memberships may not capture all the relevant dimensions of an online community's niche.
+Indeed, Chapter 3 finds at least three dimensions of specialization in terms of the benefits that members obtain from online communities.
+These are (1) access to the largest possible audience, (2) socialization in a homophilous community and (3) ability to find specialized content or information. 
+
+% TODO note on the using topic overlaps in Chapter 4.
+Still, for the purposes of the studies in Chapters 2 and 4, membership overlaps provide a number of advantages.  
+The benefits of participation may not be easily observed, so measuring online community niches in terms of participation, which is observable, is empirically tractable. 
+Furthermore, findings in Chapter 3 suggest that community leaders do not normally seek to appropriate private value from their communities. If so, then it seems more likely that ecological dynamics that shape the growth and survival of online groups will have more to do with participation, the main rival resource on which online communities depend.
+Finally, studies in organizational ecology have set out to test models that depend on linear or curvilinear relationships between niche-overlap and competitive pressures, and this required stronger assumptions around the measurement of niche width than those needed here \citep{carroll_concentration_1985, dobrev_shifting_2003}.
+Chapters 2 and 4 use membership overlaps to identify clusters of highly related communities while time-series models are used to infer competition and mutualism.  These models bear their own assumptions, but the threat to scientific validity moves from the task of measurement to the task of statistical inference.  Chapter 5 discusses how expanding definitions of an online community's niche to account for additional dimensions of specialization will be important for future work.
+
+\section{Openness Allows Dividing Time into Little Chunks}
+
+Although following McPherson's use of membership-based niches makes sense because online communities depend on voluntary contributions to produce public goods, a second key feature of online communities departs from the voluntary organizations in McPherson's studies. This is that online communities provide opportunities for ``tiny acts of participation'' like signing a petition, fixing a typo on Wikipedia, or ``liking'' a post.
+When individuals can act in small granular ways they can easily participate in many online communities in rapid succession \citep{benkler_wealth_2006, margetts_political_2015, tan_all_2015}. 
+% What are the implications of this change for ecological interactions among online communities?
+By contrast, McPherson assumes that organizations conduct their activities in face-to-face in-person meetings and theorizes that constraints of time and space strongly limit the number of organizations to which an individual can belong \citep{mcpherson_ecology_1983}.
+After work and other obligations, it seems unlikely that many people would have time to belong to very many voluntary organizations at once, so participation in an organization is highly rival and overlaps in membership are tightly coupled with competition.
+
+Chapter 2 and prior ecological studies of online community participation follow this intuition by considering membership to be a rival resource, and assuming that online communities with overlapping users are those likely to have significant ecological interactions   \citep{butler_membership_2001,wang_impact_2012}. However, Chapter 2 avoids assuming that these interactions will be competitive and instead finds that mutualism among highly related online communities is about 4 times as common as competition, and in Chapter 3 interviewees described how these related communities have specialized roles. Together, these findings suggest that the growth and survival of a sufficiently established community are not often limited by competition over membership.
+Why is membership overlap so strongly associated with competition in the context of in-person voluntary organizations, while highly overlapping online communities are often mutualists? 
+
+% On the other hand, online groups also rely on \textit{nonrival} resources. They can even produce connective and communal public goods like opportunities to communicate or collections of information \cite{fulk_connective_1996} which can be ``antirival'' when their usefulness increases as a result of others using them \cite{kubiszewski_production_2010, weber_political_2000}. For example, the usefulness of a communication network increases as more people join it \cite{fulk_connective_1996, katz_network_1985}. Similarly, the usefulness of an information good can increase as more people come to know, refer to, and depend upon it \cite{kubiszewski_production_2010, weber_political_2000}.
+% % as when 
+% %Awareness that an online group provides an audience can motivate participation  \cite{zhang_group_2011}. 
+% If multiple online groups help build the same connective or communal public goods, they may form mutualistic interactions where contributions to one group may ``spill over'' and motivate participation in mutualist groups \cite{zhu_impact_2014}.   
+% Ecological approaches seek to understand how different types of resources will limit or promote growth.
+
+% TODO cite aaron swartz
+
+Online communities ``transcend time and space'' using asynchronous and low-cost telecommunications \citep{jarvenpaa_communication_1998, peters_speaking_1999}.  Although individuals are fundamentally constrained in their available time and energy, they can finely divide their time over many communities. \citet{margetts_political_2015} suggest this less ``lumpy'' form of participation helps enable online collective action.  
+Similarly, the fine-grained division of individuals' activities across communities is closely related to the success of online communities having ``open'' organizations with minimal barriers to participation. \citeauthor{benkler_wealth_2006} claims that the fact that information is non-rival is central to how online communities successfully peer-produce public information goods.  This characteristic of information goods also enables open organizational structures so that peer-production projects can incorporate contributions from peripheral contributors \citep{benkler_wealth_2006, bryant_becoming_2005}. 
+Together, these factors allow levels of participation that are even more unequal than those found in other voluntary organizations. For example, while ``the top 20\% of volunteering individuals contributed 50\% of the time volunteered in the USA'' in 2016, the top 1\% of Wikipedia editors put in 77\% of the effort into editing Wikipedia \citep{matei_structural_2017}.  
+% Such inequalities in the degrees of participation in an online community have often been conceptualized as a division between ``core'' and ``peripheral'' members. 
+
+% TODO add citations
+When people can spread their time across many open communities, this also shapes the nature of membership in a community and the boundaries between communities.
+Organizational ecology was developed with the relatively impermeable boundaries of commercial organizations in mind \citep{hannan_organizational_1989}. 
+This is a second reason why \citeauthor{mcpherson_ecology_1983}'s studies of voluntary organizations provide a good model for studying the ecology of online communities. 
+While commercial firms have relatively strong boundaries around internal activities and control over much of their employees' time, voluntary organizations open up more of their activities to outsiders in order to attract participants.
+As noted above, \citeauthor{mcpherson_ecology_1983} assumes that voluntary organizations with overlapping niches will compete.
+However,  mathematical ecology shows that niche overlaps do not necessarily imply competition in complex systems involving multiple organizations or resource dimensions because factors other than the overlapping resources can limit growth \citep{armstrong_competitive_1980}. 
+%  and more often because of internal limitations of the community's ability to provide benefits to its membership.
+
+
+% TODO cite some stuff about 
+Finally, by modeling community size as the ``tiny act of participation'' of commenting in a given week, the analysis of ecological dynamics in Chapters 2 and 4 might be predisposed to find mutualism. Although quantifying time spent on contributions might not be possible in the case of Reddit (how would we count the time someone spends creating art to share with an online community?), it is possible that a study of participation intensity might find weaker mutualism and stronger competition if small contributions from  peripheral members are less rival than larger contributions from core members.
+On the other hand, if these contributions take the form of non-rival information goods, then communities will be unlikely to compete over them (an artist is likely to share their effortful creations with all communities from which they desire an audience).    
+The findings of Chapters 2 and 3 both suggest that part of why subreddits with overlapping memberships can provide complementary benefits and form mutualistic ecological relationships is that membership in multiple online communities is relatively inexpensive. If subreddits became closed organizations, perhaps by introducing pricey membership fees, one would expect stronger competition over membership. In this way, openness appears to provide conditions less conducive to competition and more conducive to mutualism.
+
+% Stuff about organizational boundaries here
+
+\section{How Should Online Communities be Divided into Organizational Forms?}
+
+Related to openness and the predominance of mutualism is Chapter 3's finding of extensive specialization among online communities that have similar topics and similar members. One rarely observes more than one active subreddit with similar topics that is not differentiated in some significant way, often in size, rules, or topic.  Groups of related online communities thus depart from the organizational forms studied in organizational ecology in ways that trouble the specific strands of organizational ecology used by prior research on online communities. 
+
+% As described in Chapter 2, early studies of competition and mutualism online communities adopted density dependence theory, perhaps because it is the most influential theory in organizational ecology.  Population ecology is a set of theories and models for analyzing competition and mutualism among a set of groups that are assumed to be very similar to one another. Community ecology on the other hand studies relationships between groups without assuming they are similar. 
+
+Chapter 2 defines its approach as community ecology because it focuses on relationships between different online communities.
+This may surprise readers of the organizational ecology literature in sociology, which defines community ecology as the study of interactions between populations of organizations, but I argue it is reasonable given the heterogeneity of overlapping online communities.  
+I will also note that studies in Communication have applied the community ecology approach to study competition and mutualism between telecommunication companies \citep{dimmick_theory_1984, barnett_competition_1987} or networks of organizational relationships \citep{dimmick_theory_1984, margolin_normative_2012}.  However, such studies are a small minority in the literature. 
+
+\citet{aldrich_organizations_2006}, \citet{hannan_organizational_1989}, and \citet{astley_two_1985} all consider community ecology as having a distinct level of analysis from population ecology.
+They use levels of abstraction analogous to those used in biological ecology where a population is a set of individual organisms of the same species and a community is a set of interacting populations.  
+For these organizational ecologists, a population is a set of organizations having the same \emph{organizational form} and a community corresponds to an \emph{organizational field} of related organizational forms. 
+
+The identification of an organizational form is of central importance. 
+Both organizational and mathematical ecologists are aware that population ecology models like density dependence depend on the assumption that the population under study is homogeneous in the sense that all members of the population are equally subject to the same intra-population mutualistic and competitive forces.
+Organizational ecologists have justified these assumptions by carefully demarcating different types of organizations into organizational forms theorizing that discrete boundaries around organizational forms are constructed by homogenizing features like efficient ways to bundle transactions \citep{williamson_economics_1981}, external regulatory frameworks, or other mechanisms of institutional isomorphism \citep{dimaggio_iron_1983,hannan_organizational_1989}.
+Still, the definition of organizational forms in organizational ecology often amounts to accepting an established categorization. The fascinating question of how the processes by which such categorizations are socially constructed are related to the ecological dynamics within and between organizational forms has driven much work by Hannan and his collaborators in recent years \citep{pontikes_ecology_2014, hannan_logics_2007, hannan_concepts_2019}. 
+
+Although McPherson's series of papers on the ecology of voluntary organizations may best be described as a community ecology analysis of categories of voluntary organizations like ``sports'' or ``youth serving'' organizations, at times he resists analogizing organizations as biological populations:
+\blockquote[\cite{mcpherson_ecology_1983}]{A population of organizations, then, is not a set of discrete creatures who must mate with each other to reproduce, but a froth of bubbles, constantly sharing or exchanging members, growing and dying, and being absorbed and segmented in response to changing conditions}.
+In this instance as well as others, McPherson's papers sometimes slip from discussing ecological dynamics among different organizational forms, which are measured in the data, to dynamics between different organizations, which are not. In the above quote, McPherson clearly has a dynamic ecosystem of differentiated organizations in mind. Perhaps the set of ``sports'' organizations contains too much heterogeneity to constitute an organizational form.
+
+Later organizational ecologists studied diversity within an organizational population by appealing to a distinction between  ``core'' features which define the organizational form and are mostly stable over time and ``peripheral'' features which are allowed to vary \citep{hannan_organizational_1989}. Organizational ecologists have studied how variation and specialization of peripheral features shape competition within an organizational form. For example, \citet{dobrev_shifting_2003} studies how degrees of overlap among automotive firms' technological niches, measured as engine horsepower, changed over time and affected organizational survival. Similarly, Chapter 2 and prior ecological studies of online communities measure user overlap density to quantify how much a community's members participate in other communities \citep{zhu_impact_2014, zhu_selecting_2014, wang_impact_2012}.  Chapter 4 takes this a step further by studying how dynamically shifting niches are related to competitive and mutualistic interactions.
+
+Organizational forms of online communities might be defined according to the platform hosting them.
+Indeed, prior ecological studies of online communities have done exactly this and treated sets of communities sharing a platform like Usenet or Wikia as a population.
+However,  technological boundaries around platforms may not ensure sufficient homogeneity to justify treating these sets of communities as an organizational form.
+One finds enormous diversity in the topics and purposes of communities upon exploring a platform like Reddit, Facebook Groups, or Wikia.
+Chapter 3 finds that, even when topics and memberships are very similar, online communities are specialized in other dimensions. Although a platform clearly provides a set of common technological affordances, many platforms are flexible enough to allow a great deal of diversity in scopes and rules, and communities can greatly expand available affordances by using auxiliary technologies like bots \citep{kiene_technological_2019}.
+It is thus questionable that overlapping features of online communities like memberships or topics are ``peripheral'' while the use of a platform is ``core'' and therefore it is difficult to identify populations of online communities \emph{a priori}.
+
+% Although Chapter 2 is framed as introducing a novel community ecology approach to social computing that is complementary to population ecology, these theoretical arguments suggest that defining  may not be very useful when applied to online communities. 
+
+When categorizations of organizations of interest are not well-understood, \citet{hannan_organizational_1989} recommend using numerical clustering to find divisions of organizational forms.
+The quantitative analyses in Chapters 2 and 4 are all based on a clustering algorithm that groups subreddits with similar kinds of users.
+I define these as ``ecological communities'' in a way that is consistent with the sense of Aldrich and Ruef, although they are interested in competition and mutualism between organizational forms. 
+However, as Chapter 3 demonstrates, this results not in clusters of online communities having similar forms, but in groups of subreddits whose topics are related but whose forms vary along dimensions of scope, size, and internal structures like rules.
+Population ecology is designed to study the mutualistic and competitive processes among members of an organizational form.
+Community ecology is designed to study mutualism and competition between populations of organizations having different forms.
+Neither theory seems to fit exactly with subreddits, but Chapter 2 can be understood as advancing a community ecology analysis of organizational forms assumed to have a single member organization. If this seems overly nuanced, one can simply adopt the framing of Chapter 2 and ignore matters of organizational forms and fields and treat community ecology as a relational framework and population ecology as an environmental framework.
+
+% as a study of ecological interactions within clusters of online communities. 
+
+% In contrast to Hannan and Freeman's approach, the inspiration for prior ecological research in online communities \citep{wang_impact_2012, zhu_impact_2014},  
+
+% describes a \emph{selection} process in which many online communities are created but fail to sustain participation if they do not find a sufficient niche.   
+
+\section{Inertia and Adaptation}
+
+Organizational ecologists have tended to emphasize selection processes because  organizational cores appear to change relatively little.  The external homogenizing forces described above, together with internal factors like culture and routines that are difficult to change, lead to ``structural inertia.''  Structural inertia limits an organization's ability to rationally adapt to a changing environment.   Organizations typically lack sufficient information about their environments and the ability to coordinate change with sufficient precision in order to rationally adapt, especially when it comes to change in the ``core'' aspects of an organization \citep{hannan_structural_1984}.  However, they also experience exceptional transformational periods that accompany an increased risk of failure \citep{aldrich_organizations_2006}.  
+If organizations are adaptive, then a teleological or functionalist explanation of organizational change may be better than an ecological one \citep{ven_explaining_1995} and theories of change in organizational fields should be based on Lamarckian adaptation-based evolution instead of Darwinian selection.
+
+Whether online communities can adapt has important consequences for design interventions aimed at improving the quality or safety of online spaces. Adaptive online communities may adopt new tools for moderation or quality control or implement policy changes to address newly uncovered problems. But online communities having substantial structural inertia will struggle to adapt, problems that go unaddressed will contribute to communities' declines, and solutions will largely emerge through the construction of new communities.  A selection-based change process may be slower than an adaptive one because it will be limited by rates of community formation and decline. 
+
+% Three possible types of explanations: leadership, membership composition, routinization! 
+Prior research into online communities suggests a relatively high degree of structural inertia, at least when it comes to policy \citep{teblunthuis_revisiting_2018, halfaker_rise_2013}, but the origins of this inertia are not obvious. 
+One explanation looks to the composition of contributors to an online community and sees social barriers to diverse newcomers as limiting capacities for change \citep{lam_wp:clubhouse?:_2011, tripodi_ms_2021,menking_people_2019}.
+Another explanation is the entrenchment of oligarchical leadership \citep{shaw_laboratories_2014}, who may be conservative and resist change.
+Yet in classical organizations, leaders often seek purposeful adaptation, but are foiled by internal sources of inertia like organizational culture, internal patronage networks, conflicts among stakeholders, and routines \citep{hannan_structural_1984, ven_explaining_1995}.  Some of these inertial forces appear to have analogs in online communities such as the stability of emergent roles \citep{arazy_how_2017, arazy_functional_2015}, routines \citep{keegan_analyzing_2016}, and internal conflict that may stabilize policy \citep{shi_wisdom_2019}. 
+
+Chapter 4 explores the relationship between ecological dynamics and adaptive processes in online communities by relaxing assumptions of the model in Chapter 2 to allow ecological interactions between online communities to vary over time. This allows us to explain that mutualism is more common than competition in Chapter 2 because periods of mutualistic interaction last longer than periods of competitive interaction. Finding that competitive and mutualistic dynamics in online communities are not static but dynamic, varying over time, sets up hypothesis tests about how online communities might adapt to avoid competition or increase mutualism. While I find evidence that communities increase their specialization by decreasing their user and topic overlaps in competitive conditions, I do not find that this decreases competition and increases mutualism.
+This suggests that variations in competitive and mutualistic dynamics are driven by exogenous events and that, at least when it comes to positioning themselves with respect to one another, successful online communities have ``selected an effective niche'' \citep{zhu_selecting_2014}.  
+As discussed further in Chapter 5, the evidence from Chapter 4 does not support strong claims about whether mutualism is common because of adaptation or selection. Future work should seek to demonstrate the selection process in action.  
+
+
+%Given that they vary over time, if online communities act rationally to position themselves relative to each other in ways that optimize their mutualism, we might find temporal correlations between a communities changes topic and membership overlap and its competitive and mutualistic relationships.
+
+%If not,
+
+\section{Conclusion: Contributions to Organizational Ecology}
+
+Organizational ecology began by asking ``Why are there so many kinds of organizations?'' \citep{hannan_organizational_1989, hannan_population_1977}. It provides a conceptual model of how people build systems of interdependent social structures within organizational fields, and a vast and rich literature that was initially developed to study firms in long-running commercial industries. Although \citet{hannan_organizational_1989} account for the demography of industrial unions in their theory, these unions had key characteristics in common with the firms including strong boundaries, pursuit of monopoly, and dependence on institutional legitimacy.  In general, they had their ideological and historical origins in the age of bureaucratic rationalism \citep{hannan_organizational_1989}.   Theories of organizational ecology have been widely applied to organizations in other contexts, most importantly voluntary organizations and social movements \citep{mcpherson_ecology_1983, soule_competition_2008, minkoff_interorganizational_1995, olzak_ecology_2001}.
+
+The best work of this kind meaningfully adapts organizational ecology to the new context.  For example, \citet{soule_competition_2008} link organizational ecology to the resource mobilization theory of social movement organizations. Such works use organizational ecology as a ``theory of the middle range'' that is empirically grounded but has sufficient generality to bridge across multiple domains. However, organizational ecology is not a mature paradigm like thermodynamics where models can be treated as ``scientific laws'' and expected to make accurate predictions about new contexts without any conceptual modification \citep{kuhn_structure_1970}.
+As discussed above, some basic concepts of the theory, like that of the organizational form, are difficult to apply to online communities.
+When virtually all organizations in an organizational field are highly distinctive and no established system for categorization can be found, the concept of ``organizational form'' breaks down and so may the usefulness of distinguishing between the ``population'' and ``community'' levels of analysis.   
+
+Despite these ontological concerns, as I argue in Chapter 2, density dependence theory's environmental perspective is still useful because the relationship between user overlap density and growth or survival seems to reflect the hospitality of an environment.
+However, one must keep in mind that tests of density dependence theory in online communities have provided evidence in the form of weak correlations derived from observational data.
+I suggest that a project to synthesize foundational concepts from organizational ecology with new empirically supported ideas about the interdependence between online communities will be a more effective strategy.
+
+The most important empirical finding, that mutualism is widespread, is empirically supported by quantitative-qualitative triangulation. Using statistical methods, I have found that mutualism is much more common than competition among subreddits with highly overlapping users. Based on interviews with members of these subreddits, I have found that this widespread mutualism is consistent with their intuitions and I have surfaced a plausible explanation for it: individuals seek multiple benefits from online communities, and communities with similar topics and overlapping users specialize in providing different types of benefits. 
+
+Online communities provide granular longitudinal data of individual behaviors in overlapping groups that make it possible to effectively model and test such propositions.
+Studies in organizational ecology have generally been limited to one organizational form or organizational field at a time.
+This has made it difficult to test hypotheses about the scope conditions for ecological dynamics or their consequences.
+The time series analysis strategies advanced in Chapters 2 and 4 make it possible to study ecological interactions on a much larger scale, to justify statements about what kinds of relationships are typical, and to model antecedents and consequences of these relationships.  
+It is important to recognize the limits of prior theories and quantitative tools. When results are puzzling or dead-ends are reached, talking to community members is likely to yield insights that open the way toward a solution. The project of this dissertation is to begin reconstructing organizational ecology in the relatively theory-poor but data-rich context of online communities. 
+
+% I reconstruct organizational ecology
+%  project.
+% infer a large number of competitive and mutualistic relationships groups instead of depending on an elaborate theoretical foundation.  
+% Chapter 2 uses this method to deconstruct theories like density dependence that were built upon assumptions of when organizations will be competitors or mutualists by inferring these relationships directly from the data. This begins the
+% This widespread mutualism among online communities with overlapping members radically contrasts with the competition found among offline voluntary organizations and follows from important ways that online communities differ from classical organizations.
+% The ``openness'' of online communities in conjunction with the use of digital media decrease the rivalrousness of membership, and therefore the potential for competition over members.
+% That online communities exist to provide public benefits to their members and audiences and provide different types of benefits at different sizes means that they do not in general seek to increase their sizes. 
+% Unlike commercial firms, online communities do not have strong incentives to compete with each other. 
+% Many reasons suggest that overlapping online communities will be mutualists and few reasons are apparent for why multiple communities providing equivalent benefits would exist and compete.
+% Yet, observing that mutualism is common does not explain the different roles of community founders, managers, and platform design in how systems of overlapping mutualistic online communities are organized.
+
+% Organizational ecology provides evolutionary modes of explanation for organizational change based on adaptation or selection processes \citep{ven_explaining_1995}.
+% Early organizational ecology made strong assumptions that organizational cores change relatively little because of ``structural inertia'' introduced by the external homogenizing forces described above and also by internal factors like culture and routines that are difficult to change \citep{hannan_structural_1984}.  
+% Most organizations typically have neither sufficient information about their environments nor the ability to coordinate change with sufficient precision in order to rationally adapt, especially when it comes to change in the ``core'' aspects of an organization \citep{hannan_organizational_1989}.
+% This model suggested that change in organizational forms was likely to be driven by organizational death and replacement instead of adaptation.
+
+% Paragaph below copied to chapter 4.
+% Online communities also appear to have significant inertia that may come from multiple causes discussed above, but it is also conceivable that mutualism can emerge through an adaptive process that their openness makes possible \citep{mcpherson_testing_1996}.
+% Suppose an individual chooses to participate in a community when they have the greatest expectation of finding a type of benefit. 
+% Through their participation, they can make the community a better place to find this type of benefit by contributing to the supply of resources their own content, attention, and efforts and by rewarding those who provide their benefits with thanks, votes and other signals of approval. 
+% When many individuals act in this way, their actions may collectively reinforce the ability of the community to provide the benefits in a process resembling the Schelling model of segregation \citep{schelling_micromotives_1978}.
+% When communities overlap, large degrees of specialization may emerge through such a feedback loop \citep{mcpherson_testing_1996}.
+
+% Chapter 4 deepens the exploration of ecological dynamics by relaxing assumptions of the model in Chapter 2 in order to find out how ecological interactions between online communities vary over time and the roles of adaptation and selection in changing ecological dynamics. I test the hypothesis that online communities can rationally adapt to avoid competition or increase mutualism through a time series analysis. The first step is to demonstrate that competitive and mutualistic dynamics in online communities are not static, but dynamic. They vary over time.  Therefore, if online communities act rationally to position themselves relative to each other in ways that optimize their mutualism, we might find temporal correlations between a communities changes topic and membership overlap and its competitive and mutualistic relationships.
+% However, I observe that changes in online community topics are not correlated with decreases in competition or increases in mutualism.
+% This suggests that the emergence of mutualism is driven not by adaptation, but by selection.
+% Chapter 5 discusses future directions to investigate the micro-level dynamics of this process and other open research questions. 
+  
+% % Cite exit and voice below
+% These findings lend additional support to the notion that changing online communities is difficult.  Creating new communities that provide complementary benefits may be an alternative solution when existing communities are lacking.  
+% However, the set of new benefits likely needs to be significantly different from the set of benefits provided by incumbent communities.
+% Although the results of Chapters 2 and 4 find mutualism is more common than competition, they also show that competition happens.
+% Moreover, they look at communities that have survived for long enough that their niches are measurable and therefore competition faced by the smallest communities that never take off is unobserved. 
+% If the greater prevalence of mutualism is driven by a selection process, this is likely because new communities that face competition are exceedingly unlikely to survive. 
+
+% As an example, consider attempts to reform Wikipedia to be more inclusive through changing sourcing and notability policy.
+% These attempts encounter strong structural inertia resulting from entrenched norms and policies and capacities of opponents to stonewall debate and block changes.
+% In this way, activists for a more inclusive Wikipedia have struggled to exercise voice.
+% Wikipedia's openness and creative commons licensing make it possible for other encyclopedias to reuse its content. 
+% Yet, should activists choose to exit and start an alternative to Wikipedia with different policies, this new project will be unlikely to replace Wikipedia if the differentiating factors are limited to different policies and better coverage in a few areas.
+% Still, those seeking a more inclusive knowledge production community or the specific types of knowledge it provides may find these benefits in a new specialized community.
+% However, Wikipedia will almost certainly continue to draw a larger audience and pool of contributors.
+
+% Organizational ecology's virtues stem from its defining conceptual move: to explain the success or survival of individual organizations in terms of their relationships with other organizations.
+% By adopting an intermediate level of analysis seeking to explain the largest rise of  analytically tractable 
+% with ambitions reaching far below explaining macro-historical changes like the rise of capitalism, organizational ecology  dynamics of large-scale social changes like the rise of newspapers or M-form organizational forms, without appealing to overarching macro-historical forces that are difficult to measure and may be necessarily undetermined given available evidence.
+
+
+% 
+% something about institutionalization?
+
+% We will address these shortcomings by first engaging deeply with ecological research in both biology and organization science from which we will borrow concepts and methods. In the context of online communities, we will define an ecological \emph{population} as the set of communities that share a set of \emph{resources}. In the context of online community research, resources include the labor and intellects of participants, content that they appropriate and produce, as well as the technological and social systems that communities develop to structure themselves like norms, rules, and technologies \cite{butler_membership_2001}. In an ecological model, a community must find a \emph{niche}---i.e., a set of resources that it can utilize comparatively better than other communities---in order to survive.  
+
+% Why study online communities from an ecological perspective? 
+
+% bibliography here
+% \setcounter{biburlnumpenalty}{9001}
+% \printbibliography[title = {References}, heading=secbib]
--- a/dissertations/nathante_uw_2021/ch2_identifying.tex
+++ b/dissertations/nathante_uw_2021/ch2_identifying.tex
@ -0,0 +1,933 @@
+%
+%% This is file `sample-authordraft.tex',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% samples.dtx  (with options: `authordraft')
+%% 
+%% IMPORTANT NOTICE:
+%% 
+%% For the copyright see the source file.
+%% 
+%% Any modified versions of this file must be renamed
+%% with new filenames distinct from sample-authordraft.tex.
+%% 
+%% For distribution of the original source see the terms
+%% for copying and modification in the file samples.dtx.
+%% 
+%% This generated file may be distributed as long as the
+%% original source files, as listed above, are part of the
+%% same distribution. (The sources need not necessarily be
+%% in the same archive or directory.)
+%%
+%% The first command in your LaTeX source must be the \documentclass command.
+% \documentclass[sigconf,authordraft]{acmart}
+
+
+%%%% As of March 2017, [siggraph] is no longer used. Please use sigconf (above) for SIGGRAPH conferences.
+
+%%%% As of May 2020, [sigchi] and [sigchi-a] are no longer used. Please use sigconf (above) for SIGCHI conferences.
+    
+%%%% Proceedings format for SIGPLAN conferences 
+% \documentclass[sigplan, anonymous, authordraft]{acmart}
+    
+%%%% Proceedings format for conferences using one-column small layout
+%\documentclass[acmsmall,authordraft]{acmart}
+    
+% NOTE that a single column version is required for submission and peer review. This can be done by changing the \doucmentclass[...]{acmart} in this template to 
+% \documentclass[sigconf,review=True]{acmart}
+\chapterprecishere{
+% Most explanations of changes in online group size focus on internal factors like social structures or design decisions. 
+% do not make the , and render critical questions like “which other groups are a given group's strongest competitors or mutualists?”  unanswerable.
+% TODO: Polish abstract
+% Online groups interact with each other as people, content and ideas flow among them. 
+We introduce a method for inferring competitive and mutualistic interactions between online groups from time series participation data based on the theoretical framework of community ecology. Platforms often host multiple online groups with highly overlapping topics and members. How can researchers and designers understand how interactions between related groups affect measures of group health? Inspired by population ecology, prior social computing research has studied competition and mutualism among related groups by correlating group size with degrees of overlap in content and membership. The resulting body of evidence is puzzling as overlaps seem sometimes to help and other times to hurt. We suggest that this confusion results from aggregating intergroup relationships into an overall environmental effect instead of focusing on networks of competition and mutualism among groups as our approach does. We compare population and community ecology analyses of online community growth by analyzing clusters of subreddits with high user overlap but varying degrees of competition and mutualism.
+}
+
+%%
+%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
+%% Please copy and paste the code instead of the example below.
+%%
+% \begin{CCSXML}
+% <ccs2012>
+%  <concept>
+%   <concept_id>10010520.10010553.10010562</concept_id>
+%   <concept_desc>Computer systems organization~Embedded systems</concept_desc>
+%   <concept_significance>500</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10010520.10010575.10010755</concept_id>
+%   <concept_desc>Computer systems organization~Redundancy</concept_desc>
+%   <concept_significance>300</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10010520.10010553.10010554</concept_id>
+%   <concept_desc>Computer systems organization~Robotics</concept_desc>
+%   <concept_significance>100</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10003033.10003083.10003095</concept_id>
+%   <concept_desc>Networks~Network reliability</concept_desc>
+%   <concept_significance>100</concept_significance>
+%  </concept>
+% </ccs2012>
+% \end{CCSXML}
+
+% \ccsdesc[500]{Computer systems organization~Embedded systems}
+% \ccsdesc[300]{Computer systems organization~Redundancy}
+% \ccsdesc{Computer systems organization~Robotics}
+% \ccsdesc[100]{Networks~Network reliability}
+
+%%
+%% Keywords. The author(s) should pick words that accurately describe
+%% the work being presented. Separate the keywords with commas.
+% \keywords{datasets, neural networks, gaze detection, text tagging}
+
+%% A "teaser" image appears between the author and affiliation
+%% information and the body of the document, and typically spans the
+%% page.
+
+% \begin{teaserfigure}
+%   \includegraphics[width=\textwidth]{sampleteaser}
+%   \caption{Seattle Mariners at Spring Training, 2010.}
+%   \Description{Enjoying the baseball game from the third-base
+%   seats. Ichiro Suzuki preparing to bat.}
+%   \label{fig:teaser}
+
+% \end{teaserfigure}
+
+%%
+%% This command processes the author and affiliation and title
+%% information and builds the first part of the formatted document.
+
+
+% \fontsize{12pt}{24pt}
+% \selectfont
+
+%% We're going for a "known puzzle" + "clarifying confusion" framing
+%% Remember to frame around the depvar 
+
+%% TODO: rewrite with a new outline
+%% Introduction, Related Work, Materials & Methods, Results, Discussion, Conclusions
+%% Put research question in the introduction. 
+%% Put hypotheses in Related Work.
+%% Consider Hypothesizing that mutualism will be more common than competition because subreddits in these clusters are specialized.
+%% Cut unneeded ecological terms
+%% Define needed ecological terms
+
+\section{Introduction}
+\label{sec:intro}
+
+% Why we need an ecological approach
+%Online groups are important places where people collaborate to produce information sources, engage in discussions and participate in culture.
+Although the fact is frequently ignored in social computing scholarship, online groups do not exist in isolation.\footnote{We use the term ``online group'' instead of ``online community'' to help avoid confusion with our term ``community ecology'' which plays an important conceptual and analytic role in our paper.} Indeed, although studying interdependence between online groups is difficult and complex \citep{hill_studying_2019}, research in social computing has sought to quantify how online groups share users or topics \citep{datta_identifying_2017, del_tredici_semantic_2018, tan_all_2015, hessel_science_2016}, and how such interactions relate to outcomes like the emergence of new groups \citep{tan_tracing_2018}, contributions to peer-produced knowledge \citep{vincent_examining_2018}, and the spread of hate speech \citep{chandrasekharan_you_2017}.  Although this work has demonstrated that intergroup interactions matter, very little intergroup research has tackled questions of group success---i.e., why some online groups succeed in maintaining active and long-lived participation while most do not.
+%\citep{kraut_role_2014, resnick_starting_2012}. % commented out since there was no response
+Can intergroup relationships 
+% competition or mutualism between online groups 
+explain whether online groups will grow or decline?
+% NOTE: I guess you've added the footnote above to address the reviewer concern. It's important but (a) I think it's too early in the manuscript to bring this in and (b) it should be in a footnote. -mako
+% I moved it below by the RQ. 
+
+%a growing body of social computing research shows that online groups, such as wikis, discussion forums and mailing lists spawn new groups and wage conflicts against, compete with and help each other citep{datta_identifying_2017, tan_tracing_2018, wang_impact_2012, zhu_impact_2014}.  
+
+% individual chances of success while mutualistic dynamics increase them. 
+
+% How do relationships between groups shape their chances of success? 
+
+% What's wrong with previous ecological approaches
+% Should we introduce ecological theory in the introduction at all?
+
+Studies in social computing have drawn from organizational ecology to answer this question \citep{wang_impact_2012, zhu_impact_2014, resnick_starting_2012, zhu_selecting_2014}.  Inspired by the ecological study of biological systems, organizational ecology is an influential body of theory in sociology that studies competition and mutualism among human organizations.
+% , ranging from commercial industries to social movements  \citep{hannan_population_1977, baum_ecological_2006}. 
+% NOTE: There's a jump between this sentence and the last one. I think we might need to signal, somehow, that orgecol is not puzzling or the results in soccomp are puzzling in regards to them. I've changed puzzling below to inconsistent but we should make it clear what it's inconsistent with. -mako
+Although ecological studies of firms and social movements have developed a clear and established body of theory with strong empirical support \citep{baum_ecological_2006}, similar studies of online groups have yielded inconsistent results that differ both from one context to another and from theoretical predictions. For example, wikis whose memberships overlap with other wikis survived longer \citep{zhu_selecting_2014}, but Usenet groups with overlapping memberships failed more quickly \citep{wang_impact_2012}. 
+
+% NOTE: I'm not sure conflation is the right term here. I've reworked this paragraph below -mako
+% I think you nailed it. -- nate
+We argue that these confusing results are the result of a conflation of concepts and measures from two distinct strands of theory in organizational ecology: \emph{population ecology} and \emph{community ecology}. Both define competition as a form of interdependence that \emph{decreases} growth and mutualism as one that \emph{increases} growth.  However, population ecology focuses on modeling how overlapping resources among groups affect their subsequent growth, decline, or survival \citep{astley_two_1985, baum_ecological_2006, dobrev_dynamics_2001}. It does not attempt to directly study competitive and mutualistic interactions. On the other hand, community ecology recognizes that groups often exist within ``ecological communities,'' or clusters of highly related entities, and provides an approach for inferring competitive and mutualistic interactions among these. Although the stated goal of ecological research in social computing has been to understand how groups influence each other's ability to sustain participation, this research has relied exclusively on concepts and measures from population ecology. This paper seeks to explain the puzzling set of findings in ecological social computing research by introducing community ecology.
+
+%These strands have different concepts of ecological dynamics, different levels of analysis and make distinct theoretical predictions \citep{astley_two_1985}.  
+% despite the fact that doing so is vital to 
+
+
+
+% Our contributions to CSCW are theoretical, methodological, and empirical.  
+
+% Our theoretical contribution, articulated in §\ref{sec:community_ecology},  
+
+% We then demonstrate both approaches by investigating our research question: 
+% \textit{(\textbf{RQ}) How does community ecology's view of competition and mutualism in online groups compare to that of population ecology?}
+
+% Our overarching goal is to introduce community ecology as a theoretical and methodological framework for understanding how the relationships between specific online groups shape their growth or decline. 
+
+We do so in a three-part empirical study using a dataset drawn from the 10,000 communities on Reddit with the most contributors to analyze 641 clusters of online groups with overlapping participants. 
+In Study A, we conduct the most important type of population ecology analysis, a test of what is called density dependence theory, and find support for the theory. 
+%This  suggests that competition is strongest when user overlap is high and mutualism is weakest when overlap is low. 
+This analysis suggests that high degrees of user overlap are associated with competition. 
+%VAR models are widely used in biological ecology to make inferences about competitive or mutualistic interactions between species. 
+In Study B, we introduce our method for community ecology analysis that infers networks of competitive and mutualistic interactions by using clustering analysis and vector autoregression (VAR) models of group size over time \citep{sims_macroeconomics_1980, canova_var_2007, ives_estimating_2003}. We illustrate the method in four case studies and present a large-scale computational analysis showing that mutualistic interactions are far more common than competitive ones.    
+Finally, in Study C, we bring Study A and Study B together to compare population ecology and community ecology by extending the density dependence model from Study A with a variable accounting for competition and mutualism. While we find that adding this variable does not help predict growth, including ecological interactions in our VAR models improves time series forecasting. 
+
+% importance of accounting for mutualistic and competitive interactions in predicting the growth of online groups.  We
+
+% While models including , .
+
+We discuss how these findings illuminate the differences between population ecology and community ecology and show how the two perspectives are complementary.
+While Study A suggests that competition is strongest when user overlap is high, Study B finds widespread mutualism among groups with overlapping membership.
+Although these findings might seem contradictory, they reflect how population ecology studies overlapping resources related to favorable or unfavorable environmental conditions, while community ecology studies competitive and mutualistic interactions playing out in local networks of specific groups. By demonstrating that mutualistic and competitive interactions within clusters of highly related groups are important---and by describing how to measure them---this paper lays the groundwork for future research to investigate and design for interdependence between online groups that supports their growth and success. 
+
+%we demonstrate that interactions are important and how to  inferred and are useful for time series forecasts of 
+
+% and inform design
+
+% by understanding
+
+%lays the groundwork for future research toward design
+
+% understanding how different forms of 
+
+
+
+
+
+% To answer this question,  We validate our approach by showing in §\ref{sec:res.forecasting} that
+
+% % NOTE: Is it (1) the top 1000? It would be nice to summarize the comprehensiveness here. (2) I'm ambivalent about the word "network" here. -mako 
+% We make four specific empirical contributions:   Reddit in §\ref{sec:res.characterizing} and .  
+
+% and provide an explanation for why previous ecological research in social computing has led to confusing and inconsistent results. 
+
+
+
+% NOTE: Is the sentence below correct? I guess so (at least indirectly) but I haven't read the new discussion. -mako New discussion isn't written yet, but right now that explanation is in the background section. :) -N
+
+% NOTE: cut this last sentence? -mako - I think this last sentence will be a more accurate reflection of the discussion.  -N
+%  We 
+
+%  We 
+
+% We make a theoretical contribution by introducing the community ecology perspective  that We also make a methodological contribution by providing a method for inferring these relationships from time-series data on group sizes 
+
+% Where prior approaches aggregate individual relationships between groups, our approach makes it possible to answer critical questions like ``which are a given online group's  mutualists or competitors?'' 
+
+% In the process, our theoretical work brings clarity to a confusing set of empirical results in prior research.
+
+%Discussing this seemingly contrasting finding motivates future investigations into how competitive or mutualistic ecological communities form and why some environments for online groups are competitive or mutualistic. 
+
+%  This method builds on a popular approach in biology that provides robust inferences about networks of ecological relationships. , analysis of stability, forecasts of future participation, and can scale to analyze systems of dozens of related communities. We apply this approach to four datasets. 
+
+% We validate our method using simulated data to show that it can identify a full range of ecological relationships and conduct a series of three case studies of groups hosted on the platform Reddit in \textsection \ref{sec:case.studies}. Although limited, these case studies make a third contribution in the form of empirical findings that suggest that specific patterns of relationships vary substantially across networks of groups and that mutualism appears to be much more common than competition.
+
+\section{Related Work}
+\label{sec:related.work}
+
+% One sentence on "timeliness." Find citations (Chowdry, Benkler, 
+Online groups are important sites for social support \citep{de_choudhury_mental_2014}, entertainment \citep{ducheneaut_alone_2006}, information sharing \citep{benkler_wealth_2006}, and political mobilization of disinformation campaigns and protest movements \citep{choudhury_social_2016, benkler_social_2013, krafft_disinformation_2020}.
+% knowledge of the ecosystem of online groups is important for advancing social science and informing future designs to support and manage online groups. 
+Although an online group's ability to achieve its goals depends on attracting and retaining contributors, few develop a sizable group of participants \citep{benkler_wealth_2006, dimaggio_social_2001, johnson_emergence_2014, koh_encouraging_2007, kraut_role_2014}. Many attempts to explain the success and growth of online groups look to properties of individual groups like characteristics of founders \citep{kraut_role_2014}, language use \citep{danescu-niculescu-mizil_no_2013}, turnover \citep{dabbish_fresh_2012}, and designs for regulating behavior \citep{halfaker_rise_2013, teblunthuis_revisiting_2018}.
+
+Recent research suggests that interdependence among online groups is also important to explain success and failure \citep{cunha_are_2019, kairam_life_2012, tan_all_2015, tan_tracing_2018}. 
+For example, banning hate subreddits reduced hate speech in related subreddits \citep{chandrasekharan_you_2017}. In a very different context, there is evidence that Reddit and Stack Overflow receive substantial benefits from activity on Wikipedia \citep{vincent_examining_2018}.
+% ; and editors make valuable and qualitatively different contributions across different languages of Wikipedia \cite{hale_cross-language_2015}. In addition, growth trajectories of online groups initially about similar topics can diverge \cite{zhang_understanding_2021}. 
+Our work contributes to this literature by providing a new conceptual lens and statistical method for studying competition and mutualism between online groups. 
+
+% , which theorizes how online groups depend on distinct types of resources.
+% As we discuss in §\ref{sec:rdp}, the nature of these resources makes possible conditions for mutualism or competition.  In §\ref{sec:ecology_background}, we explain how prior ecological studies of online groups extended RDT to consider how overlapping resources between communities can drive competition and mutualism and propose our first hypothesis which replicates part of these studies in Reddit, our empirical context.  Finally, in §\ref{sec:community_ecology}, we draw anew from biology and organizational ecology to present our community ecology approach and propose hypotheses to validate its usefulness for predicting the growth of online groups.
+
+\subsection{Online Groups Depend on Resources}
+\label{sec:rdp}
+
+Like prior ecological research in social computing and information systems, we build on resource dependence theory (RDT) \citep{butler_membership_2001, wang_impact_2012}. 
+\citet{butler_membership_2001} introduces
+RDT to argue that growth in online groups is driven by positive feedback as participants contribute resources such as content, information, attention, or social interactions, which motivate further contributions by subsequent participants. That said, online groups do not grow forever and RDT explains that growth is self-limiting because costs of participation increase in larger groups \citep{butler_membership_2001, butler_attraction-selection-attrition_2014}.
+
+
+% While growth far from the only criteria of success for an online group, much social computing research follows RDT by seeking to support groups' growth and survival through the attraction or retention of members \cite{koh_encouraging_2007, kraut_role_2014, cunha_are_2019}. 
+
+% For example, explanations of Wikipedia's transition from growth to decline  structures for quality assurance in a growing project that constituted barriers to newcomer participation \cite{halfaker_rise_2013, teblunthuis_revisiting_2018} spawned significant interest in designs for increasing newcomer retention that have met with limited success \citep[e.g.][]{halfaker_snuggle:_2014, morgan_tea_2013, narayan_wikipedia_2017}. Social structures like leadership, organizational practices, network structure, and design decisions can lower costs and increase benefits of participation \cite{butler_membership_2001, kraut_role_2014, tsugawa_impact_2019}. 
+
+
+%TODO: incorporate the below citations to "demonstrate that this is of importance to the social computing audience""  Also cite Charlie's paper about cross-platform interdependence
+
+%We review this foundational work in §\ref{sec:resource_dep} and then narrow our focus to prior ecological studies and other empirical work about interdependence between online groups in §\ref{sec:ecology_background}. Then, in §\ref{sec:community_ecology} we review sociological research developing community ecology theory and apply it to online groups.  
+  
+% It also builds closely on two bodies of ecological theory: first, explanations from population ecology that describe entities as sharing resources in environments and second, explanations from community ecology that theorize networks of specific community relationships.
+% In our background we introduce the first two bodies of related work in sections \ref{sec:resource_dep} and \ref{sec:ecology_background}.
+    
+    % Frame around the dependent variable: 
+    
+    % Explaining participation is important because 
+    % 1. It's a longstanding concern of the field
+    % 2. Online Groups are important to society  
+    % models 
+    % ranging from entertainment, information exchange, social interaction, to the collaborative production of knowledge and organization of collective action
+
+
+% This positive feedback between the value of prior contributions and the motivation for future contributions drives community growth.  
+% Think about the implications of our findings for the rival vs nonrival resources that could be in play.
+
+% Maybe try to deepen the discussion of resource competition, or maybe its better to avoid getting dragged into this.
+
+Ecological approaches recognize that interrelated online groups may share resources with one another in ways that constrain their growth and survival. \textit{Rival} resources like participants' time, attention, and efforts raise the possibility of competition because they become unavailable to others when used by one group \citep{benkler_wealth_2006, kubiszewski_production_2010, ostrom_public_1977,romer_endogenous_1990}. RDT suggests that declines in online participation can be explained in terms of competition over important rival resources \citep{wang_impact_2012}.
+% Online participation in general has opportunity costs and may compete with alternatives like sleep, entertainment, or work \cite{becker_theory_1965, butler_attraction-selection-attrition_2014}.
+% So online groups that provide similar benefits may be the most likely competitors because once someone has obtained satisfying benefits from one group they may go offline or switch to another activity instead of seeking similar benefits from competitor groups.\footnote{Economists refer to these as ``substitutes.' }
+
+% providing the same benefits at lesser costs might be a compelling alternative.
+% If different online groups can substitute for participation in one another and participation is rival this will lead to competition between the communities and decrease participation in both.
+% Public goods are nonrival because their usefulness is not diminished when others use them.
+
+On the other hand, online groups also rely on \textit{nonrival} resources. They can even produce connective and communal public goods like opportunities to communicate or collections of information \citep{fulk_connective_1996} which can be ``antirival'' when their usefulness increases as a result of others using them \citep{kubiszewski_production_2010, weber_political_2000}. For example, the usefulness of a communication network increases as more people join it \citep{fulk_connective_1996, katz_network_1985}. Similarly, the usefulness of an information good can increase as more people come to know, refer to, and depend upon it \citep{kubiszewski_production_2010, weber_political_2000}.
+% as when 
+%Awareness that an online group provides an audience can motivate participation  \cite{zhang_group_2011}. 
+If multiple online groups help build the same connective or communal public goods, they may form mutualistic interactions where contributions to one group may ``spill over'' and motivate participation in mutualist groups \citep{zhu_impact_2014}.   
+Ecological approaches seek to understand how different types of resources will limit or promote growth.
+% as was demonstrated when Chinese government blocked the Chinese language edition of Wikipedia, unblocked contributors decreased their participation 
+%
+
+
+%As a result, researchers, designers, and managers of online communities often set aside thorny questions of interdependence between online communities.  
+%While extensions of the resource dependence framework recognize the importance of exit from online communities \cite{butler_attraction-selection-attrition_2014}, they do not say where people go when they leave.  % Before turning to our theory of community ecology, we note differences between ecological theory and analysis in organization and biological science from  other uses of the term ecology in HCI and social computing. 
+% The term ``ecology'' often connotes interconnectedness, complexity, growth, and nature, and also crises of resource sustainability, loss, and extinction \cite{worster_natures_1994, blevis_ecological_2015}.  Most references technologists make to ``ecology'' 
+% For example Nardi and O'Day invoke the ecological metaphor in describing their vision for individuals to cultivate intentional and localized relationships with technology \cite{nardi_information_2000, bowker_bonnie_2001}.   
+% This continues a long-running intellectual exchange between social and biological sciences.  Economic thought was strongly influenced by Darwinian evolution and ecologists in biology were influenced by economic models to understand and solve problems in forestry and conservation \cite{kropotkin_mutual_2012, worster_natures_1994}. Once modern ecological science was developed it was not long before it was applied to understand human societies \cite[e.g.][]{park_human_1936, hawley_human_1986}.  Because theories of organizational ecology were crafted to address particular concerns in organization science and are laden with assumptions appropriate to traditional firms with fixed and durable boundaries, our ecological approach also draws from biology.
+
+% TODO This section needs a number of new concrete examples.  Revisit the ecological literature as well.  Also perhaps add some examples from the interview paper (which we'll cite and anonymize).    
+\subsection{Population Ecology, Density Dependence and Overlapping Resources}
+\label{sec:ecology_background}
+
+% Our theoretical approach draws from ecology. 
+While this paper focuses on the ecological study of online groups, other social computing and HCI scholars have used the term ``ecology'' (and related concepts like ``ecosystem'' and ``environment'') to denote an assemblage of sites, devices, or platforms \citep{nardi_information_1999,wang_coming_2015}. We use the term more narrowly to refer to conceptual and mathematical models of ecological dynamics. 
+In particular, our work builds on a tradition rooted in \textit{organizational ecology}. First developed in the late 1970s by sociologists studying interactions between firms, organizational ecology was inspired by, and has drawn closely from, ecological studies in biology \citep{hannan_population_1977}. 
+
+Because online groups bear similarities to traditional organizations, organizational ecology provides a compelling theoretical framework for understanding interdependence among online groups.  It has inspired at least three high-quality empirical studies of how resources shared by online groups shape their growth, decline, or survival \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+These studies draw from the \textit{population ecology} strand of organizational ecology
+%, while we introduce \textit{community ecology} as an alternative. 
+that studies ecological dynamics within a population of groups. In organizational ecology, populations have been defined as sets of organizations sharing an organizational industry or business model \citep{hannan_organizational_1989}.  In social computing, populations have been defined as online groups sharing a given social media platform \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+
+While population ecology involves several distinct theoretical propositions, \textit{density dependence theory} (DDT) is perhaps the most prominent and is the subject of all three prior ecological studies of online groups \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.  DDT models competitive or mutualistic forces in a population of groups as a function of \textit{density} which, in the earliest and most influential studies of DDT, is simply the size of the population. In this way, DDT assumes that every group in the population is facing the same competitive and mutualistic pressures \citep{aldrich_organizations_2006}.
+However, online groups sharing a platform have diverse topics \citep{kairam_life_2012}, norms \citep{chandrasekharan_internets_2018, fiesler_reddit_2018}, and user bases \citep{tan_all_2015}. Because groups sharing few resources are unlikely to be strongly interdependent, ecological studies of online groups have modeled density dependence based on the concept of \emph{overlap density} \citep{baum_ecological_2006, dobrev_dynamics_2001, wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}. Rather than the number of groups that exist in a population, overlap density measures the extent to which one group's members or topics overlap with all other groups'. Overlap density thus characterizes a group's \emph{niche} or local \emph{resource environment} defined by its distinctive topic and membership.
+
+
+%Unlike \citet{datta_identifying_2017}, we do not divide user frequency by the number of subreddits where the user appears because we do not wish to assume that users who comment in many subreddits are less ecologically important.
+
+%Overlap density is thus not a property of a population of groups, but a property of the resource environment a particular group faces. 
+
+
+% While foundational studies of density dependence in organizational research measu
+% red density and growth at the population level, ecological studies of online groups .\footnote{Although it is less common in organizational research, overlap density has also been used by some organizational ecologists \cite[e.g.][]{dobrev_dynamics_2001}.}  
+% Are  this paragraph and the next one necessary or just confusing?
+DDT proposes a model for the growth of organizational populations that has a similar structure to the RDT model that \citet{butler_membership_2001} proposes for the growth of online groups.
+In DDT, mutualism is the engine of positive feedback driving population growth. Organizational ecologists show how successful organizations in an emerging industry develop nonrival resources like the legitimacy of a business model or industrial know-how that attract new organizations to enter the market \citep{carroll_density_1989,hannan_organizational_1989}. Similarly, a population of online groups, such as those sharing a platform, may grow in size as their platform gains in popularity, as established groups spin off new ones, and as useful knowledge develops that can be shared between groups \citep{tan_tracing_2018, zhu_impact_2014}.
+
+
+% TODO add a footnote to show the analytical equivalence between the models and connection to Malthus.  
+In RDT, growth of online groups is self-limiting because of the challenges in managing large groups \citep{butler_membership_2001}. In DDT, competition among population members over rival resources limits growth \citep{hannan_organizational_1989}. DDT thus proposes a trade-off in which low density reflects limited opportunities for mutualistic contributions of nonrival resources like legitimacy, connectivity, and knowledge, but high density reflects competition over rival resources.  
+Therefore, DDT predicts that the relationship between density and positive outcomes like growth or survival is  $\cap$-shaped (inverse-U-shaped) \citep{baum_ecological_2006, carroll_density_1989}.
+
+% Save the potential conflict between RDT and DDT for the discussion
+% An individual online group's growth may be limited by the ability of their social structures to scale to include more members  \citep{butler_membership_2001}  or due to competition with other groups over members \citep{hannan_organizational_1989}.
+
+%In a homogenous population or in cases where litt
+%Population ecologists have used a number of definitions of population, but they often refer to sets of organizations having the same organizational form or business model. 
+
+%This is because many environments present a trade-off between mutualism and competition:  mutualistic forces are stronger when density is low and competitive forces are stronger when density is higher.  The intuition is that low-density environments reflect poor environmental conditions for success---if conditions were good then they would attract more growing communities hence be more dense. On the other hand, high-density environments are thought to become crowded and competitive \citepp{hannan_organizational_1989}.
+
+Tests of DDT in populations of online groups yield inconsistent results. In \citet{wang_impact_2012}, user overlap in Usenet newsgroups is associated with decreasing numbers of participants. Similarly, \citet{teblunthuis_population_2020} find that topical overlaps between online petitions are negatively associated with participation. By contrast, \citet{zhu_impact_2014} find that membership overlap is positively associated with increasing survival of new Wikia wikis. Only \citet{zhu_selecting_2014} find support for the $\cap$-shaped relationship predicted by DDT in an enterprise social media platform.
+
+In Study A, we provide a test of DDT using data from Reddit. The classical logic of DDT appears reasonable in the context of Reddit because low overlap density is likely to reflect an impoverished environment lacking in nonrival resources like skills and knowledge of experienced users, while a group with high overlap is likely to face competition over its members \citep{zhu_selecting_2014, zhu_impact_2014}:
+\textit{(\textbf{H1}) The relationship between overlap density and the growth of online groups is  $\cap$-shaped (inverse-U-shaped).}
+% such as the 
+
+%DDT sees competition and mutualism as environmental properties of an online group's niche. 
+
+DDT proposes that very high levels of density will decrease growth because of increasing forces of competition within a niche. However, to conclude that groups with the greatest membership overlap are likely competitors would be to commit a well-known statistical fallacy
+% (the term ecological fallacy does not refer to theories of population or community ecology, but rather to ``ecological correlations,'' meaning correlations involving aggregates)
+\citep{piantadosi_ecological_1988, robinson_ecological_1950}.
+The density of a group's environment suggests that it faces competition or mutualism, but it does not tell us which overlapping communities are competitors and which are mutualists.
+% DDT therefore relates resource overlaps to the growth of online groups, yet stops short of inferring competitive or mutualistic interactions among them. It does not provide a way of learning when and why groups are mutualists or competitors and this limits its ability to inform designs that take these interactions into account. 
+Community ecology overcomes this limitation of DDT.
+
+\subsection{Introducing Community Ecology \label{sec:community_ecology}}
+
+Perhaps the most natural way to understand the distinction between population ecology and community ecology lies in where they believe ecological dynamics like competition and mutualism play out \citep{astley_two_1985}.  While population ecology locates competition and mutualism within an environmental niche, community ecology locates competition and mutualism in networks of interdependent groups called \emph{ecological communities} \citep{aldrich_organizations_2006}. In organizational ecology, this can mean studying interactions between different organizational populations \citep[e.g.][]{sorensen_recruitment-based_2004, mcpherson_ecology_1983}, or networks of interactions between organizations \citep[e.g.][]{powell_network_2005, margolin_normative_2012}. 
+%Doing so makes visible the distinctive roles that particular groups play. 
+While varying conceptions of community ecology are found in the organizational ecology literature \citep{freeman_community_2006}, the approach we describe is identical in structure to that taken by \citet{aldrich_organizations_2006} and \citet{hawley_human_1986}.
+
+Community ecology focuses on \emph{ecological interactions} \citep{aldrich_organizations_2006}.
+%In organizational ecology, these interactions are referred to as ``commensal relationships.'' However, biologists use the term  ``commensal'' quite differently  to mean an unreciprocated mutualistic interaction in which one species provides benefits to another while being unaffected by it.  While for the most part, we draw our conceptions and terminology from organizational ecology rather than biology, the use of the term ``commensalism'' in organizational ecology can be confusing.  We therefore adopt the term ``ecological interaction.''
+Ecological interactions can be mutualistic when one group has a positive influence on the second such that growth in the first group leads to growth in the second.  They can also be competitive if one group has a negative effect on the second such that growth in the first group leads to decline in the second. Ecological interactions can be reciprocated if mutualism (or competition) from one group to another group is returned in kind. An ecological interaction can also be mutualistic in one direction and competitive in the other.  The competitive or mutualistic interactions in an ecological community are quantified by the \emph{community matrix}, a central analytical object in community ecology in both biology and organization science \citep{verhoef_community_2010, novak_characterizing_2016, aldrich_organizations_2006}. 
+
+In Study B, we demonstrate community ecology by inferring networks of ecological interactions in ecological communities on Reddit. Because our understanding of community ecology theory does not suggest hypotheses about what we will find, we conduct an exploratory data analysis to determine whether mutualism or competition among subreddits is more common on Reddit and present case studies illustrating the types of ecological communities we identify.
+
+%So a commensal relationship exists between each pair of groups in an ecological community.
+
+% There are six possible ecological interactions as described in Table \ref{tab:interaction.types}.  Note that they can be reciprocal (as in full mutualism and competition) or not (as in partial mutualism and competition). In our framework ``predation'' is an interaction that is positive in one direction but negative in the other. It is also possible that growth or decline in the first group has no effect on the second group, and visa-versa, a situation termed ``neutrality.''
+
+
+% \begin{table}
+%     \caption{The six possible ecological interactions between two online groups. Values in the column ``i $\rightarrow$ j'' represent the sign of $\phi_{i,j}$, group i's effect on group j.  Based on table 11.1 from \citet{aldrich_organizations_2006}.}
+%     \centering
+%     \begin{tabular}{c|c|c}
+%     i $\rightarrow$ j ($\phi_{i,j}$)& j $\rightarrow$ i  ($\phi_{j,i}$) & Interaction type  \\ \hline
+%     $+$ & $+$ & Full mutualism \\
+%     $+$ & $\cdot$ & Partial mutualism \\
+%     $+$ & $-$ & Predation \\
+%     $-$ & $\cdot$ & Partial competition \\
+%     $-$ & $-$ & Full competition \\
+%     $\cdot$ & $\cdot$ & Neutrality
+%     \end{tabular}
+%     \label{tab:interaction.types}
+% \end{table}
+
+% by conceiving of community ecology as the study of relationships between different groups.
+
+% Relationships studied in community ecology are defined by how they , but they are also important because networks of relationships 
+%and give rise to higher-order properties like stability. 
+
+%Our community ecology approach instead focus on relationships between communities from overlap density approaches to focuses on relationships between communities as a step toward solving the puzzle. 
+
+%Consider the example of how \citet{zhu_impact_2014} find membership overlap is associated with increasing survival of new Wikia wikis, but in \citepos{wang_impact_2012} study of Usenet groups user overlaps are associated with decreasing group sizes. 
+
+% Consider cutting this since we don't look at any other factors
+
+%study period,  and they found a stronger relationship when overlapping members were from more established groups. Perhaps the growth Wikia wikis was limited by knowledge of how to build a Wiki which was provided by more experienced users and user overlaps were correlated with access to such knowledge.  While 
+
+
+% What's the point of these three paragraphs?
+\subsection{Predicting Growth}
+
+In Study C we build upon our analyses from Study A and Study B by testing whether community ecology can explain the growth and decline of online groups in ways that population ecology cannot. We do this by analyzing in two different ways whether accounting for ecological interactions helps predict future group sizes.  
+% We expect it to do so because resource overlaps as modeled by DDT may be a poor proxy for the degree to which a group's environment is competitive or mutualistic. 
+In general, competition for overlapping resources will have no effect on group growth if something besides the overlapping resource limits growth \citep{verhoef_community_2010}. For example, two wikis might share a large number of contributors (they have high user overlap), but their growth might be limited by a lack of core contributors who perform important administrative tasks like policy making and software administration \citep{zhu_impact_2014}.   Community ecology relaxes the assumption that competition and mutualism are caused by user overlap density and instead seeks to infer these relationships from data.  We test the importance of this conceptual shift for predicting growth by testing two hypotheses. The first uses a model comparison approach to  test if adding a measure of ecological interactions to the density dependence model in Study A improves prediction of growth: (\textit{\textbf{H2})  A model with ecological interactions and density dependence predicts growth in online groups better than density dependence alone.}
+
+Support for H2 may be a relatively low bar for assessing whether ecological interactions are important factors shaping the growth of online groups because of confounding moderator or mediator variables related to the occurrence of ecological interactions.
+% For example, suppose mutualistic interactions were correlated with declining ecological communities.
+Therefore, we also use a time series forecasting approach to test whether modeling ecological interactions is useful for making time series forecasts of participation in online groups:
+%We seek to demonstrate in  whether including commensal relationships in time series forecasting models improves forecasting performance.  
+(\textit{\textbf{H3}) The addition of ecological interactions to a baseline time series model improves the forecasting performance.}
+While this does not directly compare population ecology and community ecology, it validates that ecological interactions are important.
+
+%With commensalism, we can seek to explain the puzzling results of resource overlap studies by exploring our second research question:\noindent \textbf{RQ2: How are degrees of user overlap and types of commensal relationships related?} 
+
+% This paragraph isn't helping very much
+% Ecological dynamics play out through the network of such relationships over time as represented by the \emph{community matrix}, $\Phi$.  
+
+
+% Analysis of the community matrix can reveal indirect relationships between groups and properties of an ecological community like stability \cite{ives_estimating_2003}. 
+%Seeing interdependence between online groups through a community ecology-based network of dynamical relationships can make visible special roles that particular groups play in an ecological community through their many mutualistic or competitive relationships.
+
+% Next we take a first methodological step toward answering questions like these by adapting vector autoregression models from biology and macroeconomics as an approach to inferring community matrices.  We then apply our approach in three case studies of related groups hosted on Reddit to reveal three qualitatively different ecological communities. 
+
+%% SOME BIKERACK RAISING MORE ISSUES WITH THE NICHE OVERLAP APPROACH
+
+% study online groups additionally shifts from an analogy of online communities as individual members of a biological species to online communities as species themselves and seeking to understand functional relationships between different online groups. 
+% Yet a closer examination of the analogy to density-dependence in organizational or biological populations reveals conceptual awkwardness.  At issue is the referent of the term ``niche.''  Should we use ``niche'' to refer to a set of resources that an online community can utilize?  This is what ``niche'' means in both overlap density and in our version of community ecology.
+
+% Social exposure is also important, but we don't deal with that in this .  The idea here is that the cost-benefit structure depends on alternatives which can lower costs or . 
+%VAR analysis can quantify the stability of the system and affords exploration of counterfactual forecasts to simulate hypothetical interventions \citep{ives_estimating_2003}.  
+
+
+\section{Materials \& Methods}
+\label{sec:methods}
+
+
+
+% The presentation of our materials and methods is organized as follows: First we introduce the methods and measures for Study A, beginning with 
+% \emph{user overlap} %(§\ref{sec:mes.overlap})
+% which is aggregated into \emph{overlap density} %(§\ref{sec:mes.density}) 
+% to predict subreddit \textit{growth} %(§\ref{sec:mes.growth}) 
+% in a loglinear regression model. Then, for Study B, we present 
+% our clustering procedure for identifying ecological communities % (§\ref{sec:clustering}) 
+% on which we fit VAR models % (§\ref{sec:var}) 
+% predicting \emph{group size}. % (§\ref{sec:mes.group.size}). 
+% To explore the types of ecological communities found on Reddit, we derive two measures from these models for each cluster: \emph{average ecological interaction} 
+%(§\ref{sec:mes.avg.mut}) 
+% which quantifies the degree of competition and mutualism in the ecological community and \emph{ecological interaction strength} %(§\ref{sec:mes.abs.int}) % which quantifies its overall intensity of ecological interactions. Next, we draw competition-mutualism networks in example ecological communities based on interpreting the VAR models using impulse response functions (IRFs) %(§\ref{sec:mes.irf}). 
+% Then, in Study C, we test H2 to compare community ecology and density dependence theory by adding \emph{subreddit average mutualism} %(§\ref{sec:mes.sub.mut}) 
+% to the model from Study A. Finally, we test H3 by evaluating whether including ecological interactions in the VAR models improves time series forecasting. % (§\ref{sec:mes.forecasting}).
+
+\subsection{Data}
+
+Our data are drawn from the publicly available Pushshift archive of Reddit submissions and comments which we obtained from December 5\textsuperscript{th} 2005 to April 13\textsuperscript{th} 2020
+\citep{baumgartner_pushshift_2020}. Within this dataset, we limit our analysis to submissions and comments from the 10,000 subreddits with the highest number of comments. There are 702 subreddits larger than the smallest subreddit included in our dataset having a majority of submissions marked ``NSFW,'' which typically indicates pornographic material. As others have done in large-scale studies of Reddit \citep[e.g.,][]{datta_identifying_2017}, we exclude these subreddits to avoid asking members of our research team to inspect clusters including pornography. The top 10,000 subreddits provide a sufficiently large number of ecological communities for our statistical analysis. 
+
+\subsection{Study A: Density Dependence Theory} % and Community Ecology}
+\label{methods:density}
+
+
+\subsubsection{User overlap \nopunct} \label{sec:mes.overlap} 
+ $o_{i,j}$ quantifies the degree to which two subreddits ($i$ and $j$) share users. 
+ %From it we construct clusters of related groups in §\ref{sec:clustering} and quantify overlap density in §\ref{sec:mes.density}.  
+\citet{zhu_impact_2014} and \citet{wang_impact_2012} both measure user overlap between two groups by counting the number of users contributing to both groups at least once and exclude users who appear in more than 10 groups. In our preliminary analysis, we found that this measure led to similarity measures and clusters with poor face validity.  These issues may have stemmed from how Reddit users often peripherally participate in many groups while participating heavily in few \citep{tan_all_2015, hamilton_loyalty_2017, zhang_community_2017}. Therefore, our measure of user overlap follows  \citet{datta_identifying_2017} by using the number of comments each user makes in each pair of groups.
+
+To measure user overlap between subreddits, we first build user frequency vectors by counting the number of times each user comments in each subreddit. We prevent giving undue weight to subreddits with higher overall activity levels by normalizing the comment counts for each subreddit by the maximum number of comments by a single author in the subreddit:
+
+\begin{equation}
+    f_{u,j} = \frac{n_{u,j}}{\max_{v} n_{v,j}} \label{eq:user.frequency}
+\end{equation}
+
+\noindent where $n_{u,j}$ is the number of times that user $u$ authors a comment in subreddit $j$, the maximum is taken over all users $v$ who comment in subreddit $j$, and $f_{u,j}$ is the resulting normalized user frequency.
+
+This results in a user frequency vector $F_j$ for each subreddit that is sparse and high-dimensional, having one element for each user account that comments in any subreddit in our dataset.
+% In the course of developing our clustering analysis described in §\ref{sec:clustering}, we found that following an approach analogous to latent semantic analysis (LSA) improved the quality of our clusters. 
+Next, we use LSA to reduce the dimensionality of the user frequency vectors. 
+LSA is based on the singular value decomposition and is common in natural language processing and information retrieval. LSA preserves subreddit similarities while removing noise and dealing with sparsity \citep{dumais_latent_2004}:
+
+\begin{align}
+    \mathbf{F} &= \mathbf{U \Sigma V}^T \nonumber \\
+    \widetilde{F_{j}} &= \mathbf{U_k}^TF_j \label{eq:user.frequency.svd}
+\end{align}
+
+\noindent $\mathbf{F}$ is the matrix where columns are author frequency vectors $F_j$ and $\mathbf{U \Sigma V}^T$ is its singular value decomposition. Truncating the singular value decomposition to use only the first $k$ left-singular vectors gives $\mathbf{U_k}$. Left-multiplying a subreddit's author frequency vector by $\mathbf{U_k}^T$ transforms the high-dimensional author frequencies into $\widetilde{F_j}$, their approximation in the $k$-dimensional space. 
+% We choose $k=600$ in the course of our grid search for a good clustering described below in §\ref{sec:clustering}.
+
+%clustering with a high silhouette coefficient.
+
+We then obtain our measure of \textit{user overlap} by taking the cosine similarities between the resulting vectors for a pair of subreddits:
+\begin{equation}
+    o_{i,j} = \frac{\widetilde{F_{j}} \cdot \widetilde{F_{i}}} {\norm{\widetilde{F_i}} \norm{\widetilde{F_j}}} \label{eq:user.overlap}
+\end{equation}
+
+\noindent where $\norm{\widetilde{F_i}} = \sqrt{\sum_{x=1}^k \widetilde{f_{x,i}}^2}$ is the Euclidean norm of the transformed user frequencies for subreddit $i$.  
+
+
+
+
+%We use the following methods and measures in our tests of our hypothesis that the relationship between user overlap density the growth of online groups is $\cap$-shaped (H1) and our hypothesis that accounting for ecological interactions will help explain growth beyond overlap density (H2):
+
+% We measure \emph{overlap density} and \emph{growth} to and . To test \textit{\textbf{H2}}, we add the overall influence of ecological interactions on a subreddit 
+
+\subsubsection{Growth\nopunct}\label{sec:mes.growth} is the dependent variable in our density dependence model testing H1 and is also used in our test of H2 as part of Study C. Growth is measured as the change in the (log-transformed) size of a subreddit over the final 24 weeks of our data, from November 4\textsuperscript{th} 2019 to April 13\textsuperscript{th} 2020. 
+
+\subsubsection{Overlap density\nopunct} \label{sec:mes.density} $d_i$ is the normalized average user overlap for a given subreddit. It is the independent variable in our density dependence model testing H1:
+
+\begin{align}\label{eq:user.overlap.density}
+  d^*_{i} &= \frac{1}{\left|S\right|-1} \sum_{j\in S;j\ne i} o_{i,j} \nonumber \\
+  d_{i} &= \frac{d_i^*}{\mathrm{max}_j d_j^*}
+\end{align}
+
+\noindent where $S$ is the set of groups in our dataset.  
+
+\subsubsection{Regression model for H1} \label{sec:reg.H1}
+To test H1, we fit Model 1 % in Equation \ref{eq:M1}
+which has first and second-order terms for overlap density to allow for a curvilinear relationship between \emph{overlap density} and \emph{growth}.
+\begin{align}
+\mathrm{Model~1} & & Y_i = B_0 + B_1 d_{i} + B_2 d^2_{i}  \label{eq:M1}
+\end{align}
+\noindent where $Y_i$ is the growth of subreddit $i$ and $d_i$ is its overlap density.
+
+
+\subsection{Study B: Introducing Community Ecology}
+
+
+%Here we review the prior work on which we build our methodological approach to inferring competitive and mutualistic relationships between online groups. %\textsection \ref{sec:inferring} describes our own methodological contributions. 
+
+\subsubsection{Clustering to identify ecological communities}
+\label{sec:clustering}
+Analyzing networks of ecological interactions is the key difference between community ecology and population ecology. 
+% In Study A we set out to survey the types of ecological communities found on Reddit to provide a comparison with a large-scale population ecology analysis.
+% in \ref{sec:clustering}
+%Here, we use a heuristic approach based on clustering algorithms to find ecological communities of online groups that all have high user overlap.
+To identify ecological communities of related subreddits, we use a clustering procedure based on the user overlap measure described above in §\ref{sec:mes.overlap}.  
+We selected a clustering model using grid search to obtain a high silhouette coefficient \citep{rousseeuw_silhouettes_1987}. The silhouette coefficient captures the degree to which a clustering creates groups of subreddits with high within-cluster similarity.
+% relative to similarity with subreddits in other clusters. 
+
+Our description of our measure for user overlap in §\ref{sec:mes.overlap} does not explain how we choose the number of LSA dimensions $k$. 
+To do so, we ran the affinity propagation \citep{frey_clustering_2007}, HDBSCAN \citep{mcinnes_hdbscan_2017} and \textit{k}-means clustering algorithms and selected the algorithm, hyperparameters, and LSA dimensions $k$ that resulted in the clustering with a high silhouette coefficient having fewer than 5,000 isolated subreddits, and at least 50 clusters.  We limit the number of isolated subreddits because some choices of hyperparameters for the HDBSCAN algorithm could improve the silhouette coefficient, but at the cost of greatly increasing numbers of isolated subreddits.  Choosing a relatively high limit to the number of isolates helps ensure that our clusters contain highly related communities. We chose an HDBSCAN clustering with 731 clusters, 4964 isolated subreddits, $k=600$ LSA dimensions, and a silhouette score of 0.48.    
+We exclude the isolated subreddits from our analysis. More details about our clustering selection process are found in the online supplement.
+
+
+%In order to test H2 and answer RQ1, we estimate the community matrix of commensal relationships between selected communities of online groups. 
+We evaluate the external validity of the chosen clustering using the purity evaluation criterion \citep{manning_introduction_2018}.
+% :
+% \begin{equation}45
+%     \mathrm{Purity}=\frac{1}{N}\sum_{m\in M}\max_{d\in D}{|m \cap d|}
+% \end{equation}
+% \noindent Where $N$ is the number of clusters $M$, $D$ are ``true'' classes to which subreddits might belong and $max_{d\in D}|m \cap d|$ is the greatest number of subreddits in cluster $m$ that belong to the same class $d$.
+To do so, an undergraduate research assistant examined a random sample of 100 clusters including 744 subreddits.  By visiting the subreddits and using her own judgment, the assistant flagged subreddits that did not seem like a good fit for their assigned cluster. Using these labels and excluding 25 subreddits that have been deleted, made private, or banned, we calculated the purity of our clustering as 0.92. This means that we believe that 92\% of subreddits belong to their assigned cluster.
+% Note that although we clustered subreddits based on user overlap, we obtain a high purity score based on a subjective evaluation of the subreddits' contents. 
+
+%\subsection{Inferring Mutualistic and Competitive Interactions}
+
+% We find f(N.clusters) clusters and f(N.isolates) isolated subreddits. The median cluster has median.cluster.size subreddits and the largest cluster has 
+
+
+\subsubsection{Group size\nopunct} \label{sec:mes.group.size} is the dependent variable of the models we use to infer ecological interactions. Measured as the number of distinct commenting users in a subreddit each week, group size quantifies the number of people who participate in a subreddit over time. Typical of social media participation data, group size is highly skewed. Therefore, we transform it by adding 1 and taking the natural logarithm. 
+
+
+% The following three paragraphs probably belong in the methods section, but I'm trying to satisfy the reviewers.
+\subsubsection{Inferring ecological interactions using Vector Auto Regression}
+\label{sec:var}
+
+The community matrix $\mathbf{\Phi}$ of ecological interactions can be inferred from time series data using vector autoregression models (VAR models). VAR models are a workhorse in biological ecology because VAR(1) models (i.e., VAR models with a single autoregressive term) have a close relationship with the Gompertz model of population growth which is widely used in ecology \citep{ives_estimating_2003}. Even in the presence of unmodeled nonlinearities, VAR(1) models can reliably identify competition or mutualism in empirically realistic scenarios \citep{certain_how_2018}. VAR models have also been widely adopted in the social sciences, particularly in political science and in macroeconomics \citep{box-steffensmeier_time_2014}. 
+
+% \citet{sims_macroeconomics_1980} advocated VAR modeling in macroeconomics to address a problem in the field as an alternative to structural equation modeling (SEM), which required detailed specification of a large number of theoretical assumptions to identify. 
+%similar to structural equation models but require fewer theoretical assumptions but are
+%VAR models are flexible enough to model a wide range of systems so long as sufficiently long time-series data are available \citep{sims_macroeconomics_1980}.
+VAR(1) models can be intuitively understood as a generalization of auto-regressive AR(1) models in time series analysis. But while AR(1) models predict the state of a single time series as a function of its previous value, VAR(1) models simultaneously predict multiple time series as a function of the previous values of every variable in the system \citep{canova_var_2007, ives_estimating_2003}:
+
+\begin{equation}\label{eq:var1}
+Y_t = B_0 + B_1t + \sum_{k \in K}A_k x_{k,t} + \sum_{j \in M}\Phi_{j} y_{j,t-1} + \epsilon_t
+\end{equation}
+
+\noindent where $Y_t$ is a vector containing the sizes of a set of online groups ($M$) at time $t$. $B_0$ is the vector of intercept terms and $B_1$ is the vector of linear time trends ($b_{1,j}$) for each community ($j$). $\Phi_{j}$ represents the influence of $y_{j,t-1}$, the size of the $j^{\mathrm{th}}$ online group at time $t-1$ on $Y_t$. $\Phi_{j}$ is a column of $\mathbf{\Phi}$, a matrix of coefficients in which the diagonal elements correspond to intrinsic growth rates (marginal to the trend) for each online group and the off-diagonal elements are intergroup influences, and $\epsilon_t$ is the vector of error terms.
+
+Additional time-dependent predictors ($x_{k,t}$) can be included in the vectors $X_{k}$ with coefficients $A_k$. Because subreddits are created at different times, growth trends must begin only after the subreddit is created. We use $X_{k}$ to introduce a  counter-trend during the period prior to the creation of subreddits so that each group's growth trend begins in the period the group is created. For each group $j$ created at time $t^0_j$ we fill $X_{j}$ with the sequence $[1,2,3,\ldots\ ,t^0_j-1,0,0,0,\ldots\ ]$. In other words, $X_{j}$ adds a counter-trend only during the period prior to the first comment in subreddit $j$. We fix the elements $a_{j,i}$ of $A_j$ equal to 0 unless $i=j$, so the counter trend only influences subreddit $j$. This effectively sets $a_{j,j}$ approximately equal to $-b_{1,j}$. 
+
+We fit VAR(1) models using ordinary least squares as implemented in the \texttt{vars} \texttt{R} package to predict the group size each week over the history of each subreddit prior to November 4\textsuperscript{th} 2019 \citep{pfaff_var_2008}. We hold out 24 weeks of data for forecast evaluation and fit our models on the remainder. To ensure that sufficient data is available for fitting the models, we exclude 946 subreddits and 89 clusters having fewer than 156 weeks of activity. 
+
+% where the cluster data lacks the necessary degrees of freedom to fit the model because the length of the training time series is less than the size of cluster plus 2. 
+
+
+% We hold out the weeks from fit.date to to.date for evalution. % Some of the clusters were too large or had too low levels of activity We include only We include a vector of intercept terms (to account for different equilibrium community sizes) and a vector of trends (to account for long-run endogenous growth) because we found that including these terms greatly improved the fit of our models to the data. Our VAR(1) models have this form in vector notation:
+
+%$$ Y_t = \Mu + \Phi_1 Y_{t-1} + \ldots + \Phi_p Y_{t-p} + \epsilon_t $$ 
+% TODO: avoid mixing matrix and vector notation.
+
+\subsubsection{Characterizing ecological communities}
+\label{sec:characterizing.ecological.communities}
+
+In Study B, we interpret the community matrix $\mathbf{\Phi}$ as a directed network of ecological interactions, a \emph{competition-mutualism network} \citep{ives_estimating_2003}. Although the elements of $\mathbf{\Phi}$ correspond to direct associations between group sizes \citep{novak_characterizing_2016}, ecological interactions can also be indirect. Consider 3 one-directional interactions between three groups ($a$, $b$, $c$) such that growth in $a$ predicts decreased growth in $b$ ($\phi_{a,b} < 0$), growth in $b$ predicts decreased growth in $c$ ($\phi_{b,c} < 0$), but $a$ and $c$ do not directly interact ($\phi_{a,c} \approx 0$).
+
+This does not necessarily mean that groups $a$ and $c$ are independent. Rather, an exogenous increase in $a$ predicts a decrease in $b$ and thereby an eventual increase in $c$.  Such indirect relationships are analyzed by using impulse response functions (IRFs) to interpret a VAR model \citep{box-steffensmeier_time_2014}.  In large VAR models containing many groups, the great number of parameters can mean that few specific elements of $\mathbf{\Phi}$ will be statistically significant, even as many weak direct relationships can combine into statistically significant IRFs \citep{canova_var_2007}. 
+
+\subsubsection{Average ecological interaction\nopunct} \label{sec:mes.avg.mut}  $\overline{m}$ measures the extent to which an overall ecological community is mutualistic or competitive by taking the mean point estimate of the off-diagonal coefficients of $\mathbf{\Phi}$:
+
+\begin{equation}\label{eq:average.interaction}
+\overline{m} = \frac{1}{\left|M\right| \left(\left|M\right| - 1\right)} \sum_{i\in M} \sum_{j\in M;j\ne i} \phi_{i,j}
+\end{equation}
+
+\noindent If $\overline{m} > 0$ then mutualistic interactions within the ecological community are stronger than competitive ones, and if $\overline{m} < 0$ then competitive interactions are stronger than mutualistic ones.
+
+\subsubsection{Ecological interaction strength\nopunct} \label{sec:mes.abs.int} $\kappa$ quantifies the overall strength of ecological interactions in an ecological community as the mean absolute value of the point estimates of the off-diagonal coefficients of $\mathbf{\Phi}$:
+
+\begin{equation}\label{eq:average.absolute.interaction}
+\kappa = \frac{1}{\left|M\right| \left(\left|M\right| - 1\right)} \sum_{i\in M} \sum_{j\in M;j\ne i} \left| \phi_{i,j} \right|
+\end{equation}
+
+\noindent where $\left| \phi_{i,j} \right|$ is the absolute value of the coefficient $\phi_{i,j}$.
+
+Ecological communities of subreddits with overlapping users vary in both the overall strength of ecological interactions and in the overall degree of mutualism and competition between member groups.  If an ecological community's average ecological interaction is positive, we say the ecological community is mutualistic.  If it is negative, we say the ecological community is competitive. The average ecological interaction can be close to 0 in two ways. First, the ecological interaction strength can simply be low.  Alternatively, the ecological community can have a mixture of competitive and mutualistic interactions that cancel one another out when averaged.  % Such an ecological community can have high ecological interaction strength. 
+
+\subsubsection{Impulse response functions\nopunct}\label{sec:mes.irf} (IRFs) of our VAR(1) models correspond to our visualizations of example competition-mutualism networks in §\ref{sec:case.studies}. An IRF predicts how much each group's size would change in response to a sudden increase in the size of each other group \citep{verhoef_community_2010}:
+
+\begin{equation}
+    \mathbf{\Theta_t} = \mathbf{\Theta_{t-1}}\mathbf{\Phi}, \quad t = 1, 2, \ldots \label{eq:irf} 
+\end{equation}
+
+\noindent where $\mathbf{\Theta_t}$ is the impulse response function at time $t$.   $\mathbf{\Theta_0}$ is an $\left|M\right|$-by-$\left|M\right|$ identity matrix so our impulses represent a log-unit increase of 1 to each group. $\mathbf{\Theta_t}$ is a matrix with elements $\theta^t_{i,j}$ corresponding to the response of group $j$ to the impulse of group $i$.  We draw an edge $i \rightarrow j$ in the competition-mutualism network if the 95\% CI of $\theta^t_{i,j}$ does not include zero at any time $0 < t \le 10$.  If $\theta^t_{i,j} >0 $, the edge indicates mutualism and if  $\theta^t_{i,j} < 0$  the edge indicates competition.\footnote{In higher-order VAR($p$) models that use $p>1$ past observations as predictors $\theta^t_{i,j}$ can be less than 0 for some $t_a$ and greater than 0 for some $t_b$. However, this is not possible in the VAR(1) models we use.}  We compute the IRFs with bootstrapped confidence intervals (CI) based on 1,000 samples using the \texttt{vars} \texttt{R} package.
+
+
+% The community matrix $\Phi$ is interpretable as a network of commensal relationships \citep{ives_estimating_2003}. While the coefficients of $\mathbf{\Phi}$ correspond to direct associations between group sizes \cite{novak_characterizing_2016}, commensal relationships can also be indirect. Consider relationships between three groups (A, B, C) such that A partially competes with B and B partially competes with C but A and C have no direct relationship. A VAR(1) model inferring these relationships will have negative coefficients for $\phi_{AB}$ and $\phi_{BC}$  but $\phi_{AC}$ will be nearly zero. 
+
+% TODO plot the examples on figure 1. 
+
+%The central prediction of density dependence theory is that there will be a curviliear, inverse-U-shaped ($\cap$-shaped) relationship between overlap density and growth.  
+
+\subsection{Study C: Predicting growth}
+
+\subsubsection{Average subreddit mutualism\nopunct}\label{sec:mes.sub.mut} $m_j$ is the independent variable for our test of H2 and measures the average influence of other subreddits in the ecological community on a given subreddit $j$, which we calculate by taking the mean of off-diagonal elements of row $j$ of the community matrix:
+
+\begin{equation}\label{eq:average.subreddit.mutualism}
+m_j = \frac{1}{\left|M\right|-1}\sum_{i\in M;i\ne j} \phi_{i,j}
+\end{equation}
+
+\noindent where $M$ is the set of subreddits in the ecological community and $\left|M\right|$ is the number of subreddits in $M$. We use the mean instead of the sum because different ecological communities have different numbers of subreddits.
+
+\subsubsection{Regression models for H2} We test H2 by using likelihood ratio tests to compare Model 1 % (above in \ref{sec:reg.H1}) 
+and Model 2 % in Equation \ref{eq:M2} 
+which adds \emph{average subreddit mutualism} ($m_i$) as a predictor. We also fit Model 3 % in Equation \ref{eq:M3} 
+which we compare to Model 2 to test if overlap density explains variation that average subreddit mutualism does not.
+
+\begin{align}
+\mathrm{Model~2} & & Y_i &= B_0 + B_1 d_{i} + B_2 d^2_{i} + B_3 m_i \label{eq:M2} \\
+\mathrm{Model~3} & & Y_i &= B_0 + B_3 m_i \label{eq:M3} 
+\end{align}
+\noindent where $Y_i$ is the growth of subreddit $i$, $d_i$ is its overlap density, $m_i$ is its average subreddit mutualism, and $B_0$, $B_1$, $B_2$, and $B_3$ are regression coefficients. 
+
+\subsubsection{Forecasting growth using ecological interactions}
+\label{sec:mes.forecasting}
+To test H3, we evaluate whether modeling ecological interactions improves time series forecasting of future participation in online groups by comparing the model in Equation \ref{eq:var1} to a baseline model with  off-diagonal elements of $\mathbf{\Phi}$ fixed to 0. This baseline model is equivalent to our VAR model, but excludes ecological interactions.
+
+We use two forecasting metrics with differing assumptions: root-mean-square-error (RMSE) and the continuous ranked probability score (CRPS).  RMSE is commonly used, non-parametric, and intuitive, but does not take differing scales of the predicted variable or forecast uncertainty into account.  Thus, in our setting it may place excessive weight on the forecasts of larger subreddits where errors may have greater magnitude simply because the absolute magnitude of the variance is greater.  By rewarding forecasts where the true value has high probability under the predictive distribution, the CRPS accounts for variance in the data and rewards forecasts for both accuracy and precision and is thus a ``proper scoring rule'' for evaluating probabilistic forecasts \citep{gneiting_strictly_2007}. Our CRPS calculations assume that the predictive forecast distribution for each community is normal with standard deviations given by the 68.2\% forecast confidence interval. We calculate CRPS using the \texttt{scoringRules} \texttt{R} package \citep{jordan_evaluating_2019}.
+
+\section{Results}
+\label{sec:results}
+
+% The organization of our results follows that of our methods.  We begin with Study A % (§\ref{sec:res:studyA}) 
+% in which we find, as predicted by H1, that the relationship between overlap density and growth is $\cap$-shaped relationship. Then, in Study B,% (§\ref{sec:res.characterizing})
+% we explore a typology of ecological communities along two dimensions: (1) the degree to which a community is mutualistic or competitive, and (2) the overall strength of ecological interactions between the communities member groups. In the N.clusters ecological communities analyzed in our VAR(1) analysis, we find that mutualistic relationships are much more common than competitive ones. Our case studies % (§\ref{sec:case.studies}) 
+% illustrate the typology using 4 example ecological communities.  Finally, in Study C, we do not find support for H2 %in §\ref{sec:res.likelihood.ratio.test} 
+% as adding average subreddit mutualism to the density dependence model does not improve growth prediction. But we do find, in support of H3, that ecological interactions improve forecasting performance in our time series models. 
+
+
+
+\begin{figure*}
+  \centering
+
+\includegraphics[width=\linewidth]{figures/knitr-fig_densityxgrowth-1} 
+
+\caption{Relationship between density and growth.  A 2D histogram of subreddits with overlap density (log-transformed) on the X-axis and the change in the logarithm of the number of distinct commenting users on the Y-axis.  The black line shows the marginal effect of overlap density on growth as predicted by Model 2. The gray region shows the 95\% confidence interval of the marginal effect. \label{fig:density}}
+\end{figure*}
+
+% In §\ref{sec:ecology_background} we presented H1 before RQ1 but we report results for H1 in the same section as H2 since they refer to the same regression model. 
+
+%We first present high-level findings that demonstrate advantages of our community ecology approach upon the overlap density approach. We find that accounting for commensal relationships in time-series models increases forecasting accuracy; that including subreddit average commensalism explains additional variation in subreddit over overlap density; and we compare the conclusions drawn density dependence analysis based on the correlation of overlap density and growth can lead  about the ecological environment than our analysis modeling commensal relationships between groups. Finally, we examine the distribution of \emph{average commensalism} and \emph{average absolute commensalism} to illuminate a typology of ecological communities which we illustrate through
+
+\subsection{Study A: Density Dependence Theory}
+\label{sec:res:studyA}
+
+%As discussed in §\ref{sec:ecology_background}, population ecology approaches in social computing propose that the relationship between overlap-density and growth/survival outcomes reflect an environment that may be competitive, mutualistic, or a mixture of both \citep{wang_impact_2012,zhu_impact_2014}. 
+We test the classical prediction of density dependence theory as formulated in H1 using Model 1 % (Equation \ref{eq:M1} in §\ref{methods:density}) 
+which has first- and second-order terms for the effect of overlap density on growth.  As described in §\ref{sec:ecology_background}, H1 hypothesizes that overlap density will have a curvilinear $\cap$-shaped (inverse-U-shaped) relationship with growth indicated by a positive first-order regression coefficient and a negative second-order coefficient.  
+
+\begin{table}
+  \centering
+
+% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
+% Date and time: Thu, Jul 29, 2021 - 05:22:21 PM
+\begin{tabular}{@{\extracolsep{5pt}}lccc} 
+\\[-1.8ex]\hline 
+\hline \\[-1.8ex] 
+ & Model 1 & Model 2 & Model 3 \\ 
+ Overlap density & 1.50$^{*}$ (0.26) & 1.50$^{*}$ (0.26) &  \\ 
+  Overlap density$^2$ & $-$2.08$^{*}$ (0.41) & $-$2.09$^{*}$ (0.41) &  \\ 
+  Average subreddit mutualism &  & 0.12 (0.26) & 0.11 (0.26) \\ 
+  Constant & $-$0.23$^{*}$ (0.03) & $-$0.23$^{*}$ (0.04) & $-$0.04$^{*}$ (0.01) \\ 
+ \hline \\[-1.8ex] 
+Log Likelihood & $-$4970 & $-$4970 & $-$4986 \\ 
+Observations & 4,090 & 4,090 & 4,090 \\ 
+\hline 
+\hline \\[-1.8ex] 
+\textit{Note:}  & \multicolumn{3}{r}{$^*$p$<0.01$} \\ 
+\end{tabular} 
+
+\caption{Loglinear regression predicting subreddit growth as a function of overlap density. The model supports the prediction of density dependence theory of a $\cap$-shaped relationship between overlap density and growth. \label{tab:density}}
+\end{table}
+
+
+As predicted, we observe a $\cap$-shaped relationship between overlap density and growth.  Figure \ref{fig:density} plots the marginal effects of  overlap density on growth for the median subreddit laid over the data on which the model is fit. Table \ref{tab:density} shows regression coefficients for Models 1--3. For about half of subreddits, increasing  overlap density is associated with higher growth rates.  The point where increasing density ceases to predict increasing growth and begins to predict decreasing growth is at the 49\textsuperscript{th} percentile. 
+Prototypical subreddits at this overlap density grew slightly (95\% CI: $[0.001, 0.06]$).  Yet subreddits at the lower and upper extremes of overlap density slightly declined on average. Typical groups at the 20\textsuperscript{th} percentile of overlap density declined by 1.1 members (95\% CI: $[-1.15, -1.1]$) and typical groups at the 80\textsuperscript{th} percentile declined by 1.2 members (95\% CI: $[-1.28, -1.1]$). 
+While we find support for the classical theoretical prediction of a curvilinear ($\cap$-shaped) relationship between overlap density and growth, this does not imply that relationships between highly overlapping communities are more competitive.  
+% Instead our results below % in §\ref{sec:res.characterizing} 
+% show that relationships in ecological communities of subreddits with high user overlaps are typically mutualistic. 
+
+
+\subsection{Study B: Introducing Community Ecology}
+\label{sec:res.characterizing}
+
+
+
+
+% describe the figure and the main takeaway
+% As described in §\ref{sec:characterizing.ecological.communities}, an ecological community can have positive or negative average ecological interaction §\ref{sec:mes.avg.mut} indicating if it is competitive or mutualistic and ecological interaction strength §\ref{sec:mes.abs.int}  provides a way to distinguish ecological communities with a mixture of competitive and mutualistic interactions from those where ecological interactions are weak. 
+
+Figure \ref{fig:commense.x.abs.commense} visualizes the distribution of average ecological interaction and ecological interaction strength over the 641 ecological communities we identify.  
+We observe ecological communities characterized by strong forms of both mutualism and competition, others having mixtures of the two, and some with few significant ecological interactions.  Mutualism is more common than competition, with the mean community having an average ecological interaction of 0.03 ($t=14.5$, $p<0.001$). We find that 524 clusters (81.7\%) are mutualistic. Not only are most ecological communities mutualistic, but more mutualistic ecological communities have greater ecological interaction strength (Spearman's $\rho=0.58$, $p<0.001$).
+% Note that due to our clustering procedure, our analysis examines ecological interactions among subreddits with relatively high degrees of user overlap.
+Therefore, our community ecology analysis suggests that among groups with similar users, mutualistic ecological interactions are more common than competitive ones.
+
+\begin{figure}
+
+\includegraphics[width=\linewidth]{figures/knitr-plot_commense_x_abs_commense-1} 
+
+\caption{Two-dimensional histogram showing ecological communities on Reddit in our typology.  The X-axis shows the overall degree of mutualism or competition in clusters of subreddits with high user overlap based on the average ecological interaction.  The Y-axis shows the ecological interaction strength representing the overall magnitude of competition or mutualism.}
+\label{fig:commense.x.abs.commense}
+\end{figure}
+
+
+
+\subsubsection{Example ecological communities}
+\label{sec:case.studies}
+
+We present four case studies to illustrate our typology of ecological communities of online groups. Figure \ref{fig:commense.x.abs.commense} shows that we find clusters of subreddits characterized by mutualism, competition, a mixture of mutualism and competition, and few ecological relationships at all. We select one case from each of these four types using our measures of average ecological interaction (§\ref{sec:mes.avg.mut}) and ecological interaction strength (§\ref{sec:mes.abs.int}). To allow for more interesting network structures, we draw our cases from the 367 large clusters having at least five subreddits. 
+
+\input{resources/network-figures.tex}
+
+Figure \ref{fig:networks} presents visualizations of competition-mutualism networks representing statistically significant impulse response functions as described in §\ref{sec:mes.irf}. During our analysis, we also examined the terms of the vector autoregression parameter $\mathbf{\Phi}$, the impulse response functions, and model fits and forecasts, all of which are available in our online supplement.  We also visited each subreddit in the clusters and read their sidebars and top posts to support our brief qualitative descriptions.
+
+\subsubsection{Mutualism among mental health subreddits}
+
+% TODO, cite somebody on mental health.
+To find a case characterized by mutualism, we selected the top 37 large clusters with the greatest average ecological interaction. From these, we arbitrarily chose one interesting ecological community, the \textit{mental health} cluster, which includes 11 subreddits for supporting people in struggles with mental health, addiction, and surviving abuse.  
+Constitutive subreddits include those focused on specific mental health diagnoses like \texttt{r\Slash bpd} (borderline personality disorder) and \texttt{r\Slash cptsd} (complex post-traumatic stress disorder) while others like \texttt{r\Slash survivorsofabuse} and \texttt{r\Slash adultsurvivors}
+are support groups. 
+
+The interactions among these subreddits are dense and primarily mutualistic as shown in Figure \ref{fig:mut.network}. There are a handful of competitive interactions like the reciprocal competition detected between \texttt{r\Slash codedependence} and \texttt{r\Slash bpd}. We also observe some interactions that are mutualistic in one direction and competitive in the other. For example, growth in \texttt{r\Slash addiction} predicts increasing growth in \texttt{r\Slash cptsd} even as that growth in \texttt{r\Slash cptsd} predicts decreasing growth in \texttt{r\Slash addiction}. This suggests a pattern in which \texttt{r\Slash cptsd} siphons members from \texttt{r\Slash addiction}. That said, the density of mutualistic interactions shown in Figure \ref{fig:mut.network} suggests that different subreddits have complementary roles in this ecological community as people turn to different types of groups for help with interrelated problems.  While attempting to explain why different online groups form mutualistic or competitive interactions is left to future research, the example of mental health subreddits shows how groups with related topics and overlapping participants can have mutualistic interactions where growth in one predicts growth in many of the rest.
+
+\subsubsection{Competition among real estate and finance subreddits}
+
+
+To find competitive clusters, we selected from the 36 large clusters with the lowest average ecological interaction an ecological community that we label \textit{finance}. Among the 6 subreddits in this cluster, \texttt{r\Slash realestateinvesting}, \texttt{r\Slash realestate} and \texttt{r\Slash commercialrealestate} all deal in different aspects of the real estate industry, while \texttt{r\Slash financialindependence} and \texttt{r\Slash fatfire} (the acronym ``fire'' means ``financial independence/retire early'') are focused on building wealth and becoming financially independent and \texttt{r\Slash financialplanning} is a general purpose subreddit for financial advice.
+
+In contrast to the mental health ecological community, the finance cluster has mostly competitive ties as visualized in Figure \ref{fig:comp.network}. The fact that even this cluster, among the most competitive in our data, contains a number of mutualistic ties reflects just how prevalent mutualism is among subreddits with high degrees of user overlap. That said, we detect three reciprocal competitive interactions among the three subreddits that focus on real estate. The edges from \texttt{r\Slash fatfire} to \texttt{r\Slash commercialrealestate} and \texttt{r\Slash financialindependence} are competitive as well.   
+Interestingly, all interactions between the general finance subreddits (\texttt{r\Slash financialplanning} and \texttt{r\Slash financialindependence}) and \texttt{r\Slash realestate} are mutualistic.
+%Interestingly, are mutualistic.
+
+\subsubsection{Mixed interactions among timepiece subreddits}
+
+Next, we turn to an example of an ecological community with low average ecological interaction but high  ecological interaction strength.  
+We first select the 36 %(10\%) 
+large clusters with the average ecological interaction  closest to 0. To find an ecological community with a mixture of mutualism and competition, we select from the 15 clusters with the greatest ecological interaction strength from within this group and chose the \textit{timepiece} cluster containing 7 subreddits about watches. 
+
+As shown in Figure \ref{fig:mixed.network}, the ecological community of timepiece subreddits is dense with ecological interactions (although not as dense as the mental health subreddits). We observe both reciprocated mutualistic interactions, like that between \texttt{r\Slash rolex} and \texttt{r\Slash gshock}, and competitive interactions like that between \texttt{r\Slash gshock} and \texttt{r\Slash seiko}.  We also observe numerous unreciprocated competitive and mutualistic relationships like the mutualism between \texttt{r\Slash watchexchange} and \texttt{r\Slash watchcirclejerk}\footnote{The suffix is widely understood on Reddit to signify a jokey, meme, or satirical subreddit.}
+and the competition between \texttt{r\Slash japanesewatches} and \texttt{r\Slash seiko}.
+Though the average ecological interaction among these subreddits is near 0, our analysis reveals a complex ecological community with a mixture of competition and mutualism.   
+ 
+\subsubsection{Sparse interactions among Call of Duty subreddits}
+
+To find a case where ecological interactions are weak, we return to the group of the 36 %(10\%) 
+large clusters with the average ecological interaction closest to 0 but select from the 15 clusters within this group with the lowest ecological interaction strength. From these, we chose the \textit{Call of Duty} cluster containing five groups about the popular military first-person shooter series of video games.
+
+% % more quotations
+The Call of Duty ecological community is sparse, having only two significant ecological interactions among its 5 member groups. This ecological community includes subreddits about different editions of the series such as \texttt{r\Slash blackops3}, \texttt{r\Slash infinitewarfare} and \texttt{r\Slash wwii} as well as one about a popular spin-off zombie game \texttt{r\Slash codzombies} and the more general \texttt{r\Slash callofduty} subreddit. We find that growth in \texttt{r\Slash blackops3} or \texttt{r\Slash codzombies} predicts growth in \texttt{r\Slash infinitewarfare}, and we detect no other ecological interactions. 
+
+The timepiece and Call of Duty ecological communities illustrate how subreddits with overlapping users can have relatively strong or weak forms of ecological interdependence.  Although both clusters are characterized by high degrees of user overlap and low average ecological interaction, the timepiece cluster has a dense competition-mutualism network while the Call of Duty network is sparse.
+    
+\subsection{Study C: Predicting Growth}
+\label{sec:res.studyC}
+
+We now compare the environmental approach of population ecology with the relational approach of community ecology.
+In Study B, we presented examples of diverse ecological communities among subreddits with overlapping members.  However, the presence of this diversity does not mean that ecological interactions are related to the growth of online groups, the key outcome of previous ecological studies.  We therefore hypothesized in H2 that ecological interactions will improve the predictive performance of a density dependence model.
+
+\subsubsection{Ecological interactions do not improve growth prediction}
+\label{sec:res.likelihood.ratio.test}
+
+To test H2, we compare Model 1, our density dependence model having first- and second-order terms for overlap density, with Model 2, which also includes average subreddit mutualism (§\ref{sec:mes.sub.mut}) as a predictor.  We also examine Model 3, in which the only predictor is average subreddit mutualism. Table \ref{tab:density} shows regression coefficients for our models. 
+
+We do not observe a statistically significant association between average subreddit mutualism and growth ($B_3=0.12$, $\mathit{SE}=0.26$).  
+% We observe that average subreddit mutualism is positively associated with growth , which makes sense as subreddits with greater average subreddit mutualism benefit more from mutualism or are hurt less from competition.
+Moreover, a likelihood ratio test comparing Model 1 and Model 2 does not support H2 as Model 2 does not predict subreddit growth better than Model 1 ($\chi^2 = 0.23$, $p>0.05$). 
+% Therefore, average subreddit mutualism does not help predict growth compared to the density dependence model alone. 
+Comparing Model 2 to Model 3 shows that overlap density explains variation that average subreddit mutualism does not ($\chi^2 = 33$, $p<0.001$). 
+%This suggests that the density of a subreddit's niche helps explain subreddit growth in important ways not captured by ecological interactions.  
+Overlap density helps explain a group's future growth, but the overall degree of mutualism or competition a group faces in its ecological community does not. 
+% In §\ref{sec:discussion}, we discuss how overlap density may only capture the hospitality of a group's environment and may be independent of mutualism and competition within its ecological community.
+
+\subsubsection{Forecasting accuracy}
+\label{sec:res.forecasting}
+
+The likelihood ratio tests in §\ref{sec:res.likelihood.ratio.test} are limited because improvements in predictive performance (or lack thereof) may be due to unobserved factors predictive of growth that are correlated with average subreddit mutualism. We hypothesized in H3 that the intergroup dependencies in our VAR models can better forecast the size of subreddits compared to baseline time series models that do not account for ecological interactions.  As described in §\ref{sec:mes.forecasting}, we test H3 by comparing two forecasting metrics: the root-mean-square-error (RMSE) and the continuous ranked probability score (CRPS).
+
+VAR models including ecological interactions have forecasting performance superior to the baseline model in terms of both RMSE and CRPS. We evaluate the 24-week forecast performance for all  subreddits which were assigned to clusters. The RMSE under the baseline model (0.84) is greater than the RMSE of the VAR models (0.75) and the CRPS of the baseline model (72,853) is also  greater than the CRPS of the VAR models (72,669).  This reflects a substantive improvement in forecast accuracy robust to the choice of the forecasting metric.  
+
+Our baseline model contains a constant term and a trend term for each group and therefore accounts for all time-invariant within-group variation.  Because overlap density is a subreddit-level variable that does not vary over time,
+we know that the improvement in forecasting performance comes from modeling ecological interactions in ways not captured by overlap density.
+
+\section{Threats to Validity}
+\label{sec:limitations}
+Our work is subject to several important threats to validity that we cannot fully address. First, we study ecological communities on only one platform hosting online groups and our results may not generalize to other platforms or time periods.
+Additionally, while our community ecology approach assumes that ecological interactions drive dynamics in the size of groups over time and cause groups to grow or decline, drawing causal inferences using our method would depend on several untestable assumptions. For example, our ability to infer causal relationships might be limited if groups we do not consider---including groups on other platforms---play a role in an ecological community. Regression estimates in Models 1--3 may be confounded by omitted variables and cannot support causal interpretation. 
+Therefore, we refrain from claiming that the relationships we infer are causal.
+
+The method we propose for identifying ecological interactions between online groups has limitations common to all time series analysis of observational data. 
+Potential omitted variables might also include additional time lags of group size. Although we chose to use VAR(1) models with only 1 time lag, we hope future work can improve upon our approach and model more complex dynamics with additional lags.
+% Our results are offered as limited temporal associations consistent with inferred ecological interactions.
+Like most other time series analysis, vector autoregression assumes that the error terms are stationary. This is difficult to evaluate empirically and may not be realistic \citep{canova_var_2007}.  Future work might relax these assumptions using more complex models with time-varying parameters, state space models \citep{box-steffensmeier_time_2014}, nonlinear time series models \citep{cenci_regularized_2019, kantz_nonlinear_2003}, or stationarity-enforcing priors \citep{heaps_enforcing_2020}.  Such approaches may require additional contextual knowledge and be difficult to scale to an analysis of hundreds of different ecological communities, but may prove fruitful in future work focusing on ecological communities of interest. Such models may also be useful in future work investigating how ecological interactions change over time.
+
+Additional threats to validity stem from our use of algorithmic clustering to identify ecological communities.
+Organizational ecologists have rarely attempted to estimate the full community matrix for an entire population containing a large number of groups because of data and statistical limitations \citep[e.g.,][]{ruef_emergence_2000, sorensen_recruitment-based_2004}. For instance, 100 million possible ecological interactions exist within a set of 10,000 groups.  Attempting to infer them all raises considerable computational and statistical challenges.
+% This makes it necessary to narrow the scope to the ecological communities of interest in ways appropriate to the research question.
+We chose to use a clustering analysis to explore the typical ecological communities on a platform.
+
+% Yet, a 
+
+While we choose clusters based on high degrees of user overlap and validate our clustering in terms of the silhouette coefficient and purity criteria, we might have obtained different results if we had clustered in a different way. Additionally, our efforts to obtain clusters with a high silhouette coefficient led us to remove a large number of subreddits from our analysis. Thus, our results are not representative of Reddit overall, but only of those subreddits that were included in our analysis.  Furthermore, clustering algorithms like the one we use may not have unique solutions and different initial conditions and hyperparameters might lead to different results. While such algorithms allow us to scale up our analysis, future work should use principled definitions of an ecological community based on qualitative contextual knowledge in focused studies of particular ecological communities.
+% future investigations should also consider qualitative approaches to constructing ecological communities.
+% Finally, our three cases studies are limited in that they can offer only a proof-of-concept analysis and an enticing hint at more comprehensive future analyses with more rigorously defined populations of online groups.
+% Although we found varying results in the three ecological communities we selected, these case studies can provide little explanation for when one should expect to find different forms of commensalism in online groups. Our hope is that these initial results can point in new directions for research. 
+% % We looked at three different sets of related online groups and found three qualitatively different ecological communities.  
+% As is true in all case study research, there is little reason to expect findings from any one of our case studies to generalize to any specific other set of contexts.
+
+\section{Discussion}
+\label{sec:discussion}
+
+To introduce community ecology and compare it to population ecology, we presented three studies. In Study A, we found support for H1 showing---as predicted by density dependence theory---that overlap density has a $\cap$-shaped association with subreddit growth.
+Subreddits with moderate overlap density in our data declined less than subreddits with either very low or very high overlap density.
+According to population ecology theory, this suggests that  high-density environments are competitive and less conducive to growth than medium-density environments.
+
+%prevalence of mutualism among highly overlapping subreddits contrast with our results for
+
+Surprisingly, this contrasts with our results in Study B, where we studied the diversity of ecological communities using vector autoregression models of group size over time to infer networks of ecological interactions.
+%surveyed clusters of highly overlapping groups on Reddit to.
+We find ecological communities that are mutualistic or competitive, that mix the two, or that have few significant ecological interactions at all. Overall, however, ecological communities of subreddits are typically mutualistic and mutualistic interactions are stronger on average than competitive ones. Although we find evidence of density dependence, density-dependent competition does not necessarily reflect typical relationships in ecological communities of highly overlapping subreddits.
+
+%As discussed more below, our results are due to the fact that support for H1 does not necessarily mean that most relationships between subreddits with the greatest degrees of user overlap are competitive.
+
+Our results in Study C show that the size of the other members of an ecological community improves time series forecasts of participation in online groups. However, average subreddit mutualism did not help predict growth. 
+This suggests that population ecology and community ecology offer complementary environmental and relational perspectives.   
+Population ecology's focus on environmental factors such as niche and overlap density is useful for predicting growth, but does not provide a way to study networks of mutualism and competition.
+Community ecology unpacks density and provides insights about the specific relationships between groups.  While modeling these interactions helps forecast participation levels in groups, the existence of these interactions may be independent of future growth. For example, if mutualistic relationships are common in declining ecological communities, that would explain our result for H2.
+
+%  these interactions helps time series forecasting, but whether the interactions 
+
+% While we advance community ecology as an alternative framework to population ecology, our results show that population ecology and community ecology are complementary perspectives. 
+% We tested H2 to find out whether including subreddit average mutualism improves the ability of a density dependence model to predict the size of a subreddit n.test weeks in the future and found that it did not. Therefore,
+
+% Yet in support of H3, including ecological interactions in the vector autoregression (VAR) models substantially improves their forecasting performance. 
+ 
+
+% Our  findings in Study A and Study B may appear contradictory, their coincidence in our data points to ways in which population ecology and community ecology conceive of different kinds of ecological dynamics. 
+
+The complementary nature of the two ecologies is seen in the coincidence of our findings in Study A and Study B.   
+Indeed, these results can help explain  the puzzling set of empirical results about the relationship between overlap density and outcomes like growth, decline and survival  \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+Studies of density dependence theory in social computing measure the density of an online group's niche in terms of its overlap in participants or topics. 
+%Resource overlaps seem to reflect competitive forces in some circumstances but mutualistic ones in others. 
+Our analysis clearly shows that resource overlaps between two groups might have little to do with whether they are mutualists or competitors. Instead, overlaps may simply reflect the hospitality of the environment to groups with overlapping topics or user bases.  
+As a result, the differing environmental conditions of wikis and Usenet groups might explain why user overlap was associated with the survival of wikis \citep{zhu_impact_2014} but with the decline of Usenet groups \citep{wang_impact_2012}. Wikia was a young and growing platform during \citepos{zhu_impact_2014} data collection period when the growth of groups may have been limited by knowledge of how to build a wiki, and this knowledge was provided by overlapping experienced users. 
+Usenet was in decline during \citepos{wang_impact_2012} study period and this may have produced competitive environmental conditions as users became more scarce. 
+%Users of groups with high overlap density may have greater commitment to the platform than to any particular group and competition over such users may become fierce when a platform goes into decline. 
+
+% as users with comm
+
+% because 
+
+% and \citeauthor{tan_all_2015} \cite{tan_all_2015} observe that accounts posting in fewer different groups are more likely to leave a platform.  
+% As \citeauthor{kraut_building_2012} \cite{kraut_building_2012} argue, commitment to subgroups can enhance commitment to a broader group.  This suggests that On the other hand, members of a group with high overlap density may have little commitment to it in particular.  
+
+% This suggests that commitment to a 
+
+% We suggest that when commitment to the platform declines this may amplify competition as 
+% may present environmental conditions for strong competition over those members  
+% This suggests that 
+% Such groups may face greater challenges in sustaining participation when the platform goes into decline. 
+
+The widespread mutualism found in Study B resonates with long-held understandings of ecological interactions in evolutionary theory \citep{kropotkin_mutual_2012}.  Competition is unlikely to persist because it decreases survival. Because mutualism increases survival, it will be favored by natural selection \citep{armstrong_competitive_1980, axelrod_evolution_1981}. Similarly, competition can be avoided if groups adopt specialized roles in their ecological community, a dynamic known as resource partitioning in organizational ecology \citep{carroll_concentration_1985,menge_competition_1972,schoener_resource_1974}. Resource partitioning theory suggests that the competition among real estate subreddits observed in Figure \ref{fig:comp.network} may be due to a lack of specialization.  If specialization does not emerge over time, such groups of competing subreddits may have decreased survival. By contrast, mental health support groups like those observed in Figure \ref{fig:mut.network} appear to have distinctive purposes or roles. Future work to test such mechanisms in ecological communities of online groups may reveal ways that online groups complement or cooperate with each other.
+
+
+%Our results demonstrate population ecology's approach to competition and mutualism in a test of density dependence theory and provide an evaluation of community ecology's ability to predict subreddit growth.
+
+
+%Future work should directly test this hypothesis about the relationships between platform-based and subgroup-based commitment.
+
+% In general, competition over overlapping resources will have no effect on group growth if something besides the overlapping resource limits growth \cite{verhoef_community_2010}. For example, two wikis might share a large number of contributors (have high user overlap), but their growth might be limited by a lack of core contributors who perform important administrative tasks like policy making and software administration \cite{zhu_impact_2014}.   Community ecology relaxes the assumption that competition and mutualism are caused by user overlap density and instead seeks to infer them from data.
+% To illustrate our approach, we presented 4 example ecological communities found on Reddit §\ref{sec:case.studies}.
+Within large platforms for online groups, the great number of ecological communities that can be studied should make it possible for future work to apply methods from network science to construct and test generalizable theories about the roles of different types of resources, design features of platforms, and governance institutions in these ecological interactions. Future work should also incorporate community ecology analysis in case studies of important topics such as ecological communities engaged in peer production, political mobilization, misinformation, or mental health support. 
+
+Although we focused on online groups within a single platform, groups may use multiple platforms with distinctive affordances for different purposes \citep{fiesler_moving_2020, kiene_technological_2019}. Since the VAR method relies only on time series data to infer ecological interactions, it can be applied to study ecological communities spanning social media platforms. Community ecology can thus provide a bridge between quantitative studies of participation in online groups and theories of interconnected information ecologies \citep{nardi_information_1999}. While we focus on relationships between groups sharing a platform, one can apply our concepts and methods to understand how interdependent systems of technologies and users give rise to higher levels of social organization on social media platforms \citep{astley_two_1985, aldrich_organizations_2006}. 
+
+\subsection{Implications for Design}
+    
+% While Resnick et al.~\citep{resnick_starting_2012} 
+In the final chapter of their book on \textit{Building Successful Online Communities}, \citet{kraut_building_2012} advise managers of online groups to select an effective niche and beware of competition. However, these recommendations are based on little direct evidence from studies of online groups and offer almost no concrete steps that a designer or group should take based on either piece of advice. Although further research into ecological interactions is needed before design principles can be derived, we provide a framework for online group managers to think about ecological constraints on group size. 
+While intuition suggests that online group managers might seek out mutualistic relationships and avoid competitive ones, it is often not obvious whether another group with overlapping users is a competitor or mutualist. 
+Our method provides a way for group managers to know. 
+
+Competitors have a negative impact on growth, but ecological theory suggests that specialization is an adaptive strategy in response to competition \citep{aldrich_organizations_2006, carroll_concentration_1985, kraut_building_2012, powell_network_2005}. 
+%For example, the growth of Wikipedia caused other online encyclopedia projects to shift their focus \cite{hill_almost_2013}. 
+Using our method, group managers might identify competitors limiting the growth of their groups. With the knowledge of this analysis in hand, they might be able to escape a competitive dynamic by specializing. 
+While competitive relationships are defined by how they decrease the size of groups, competition can also be important to the health of the broader ecological community. Exit to an alternative group can be an avenue for political change in response to grievances and poor governance \citep{hirschman_exit_1970, frey_emergence_2019}. The threat of competition with other groups may make expressions of voice more persuasive to moderators or platforms \citep{hirschman_exit_1970}. 
+
+Groups looking to increase activity should desire to seek out mutualistic relationships, and we believe that designers of online platforms can help them do so. Features such as meta-groups, group search, recommendation engines, and practices like linking related groups may lower barriers between groups and support mutualism. However, it is not obvious to what extent particular features will support competition, mutualism, or both.  Using our method, managers and designers can test features intended to support mutualism.
+
+\section{Conclusion}
+
+% Rewrite conclusion
+While explanations for the rise or decline of online groups often look to internal mechanisms, understanding the role of interdependence between online groups is increasingly important.
+While prior research has investigated competition and mutualism among online groups with overlapping users and topics using the population ecology framework \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}, this approach does not provide a way to infer competitive or mutualistic interactions among related groups.
+We introduce the community ecology framework as a complementary perspective to population ecology. 
+% The two ecologies both seek to explain why online groups grow or survive, but they focus on different levels of analysis \cite{astley_two_1985}.  
+By inferring competition-mutualism networks directly from time-series data, our community ecology approach helps resolve the empirical tensions raised by prior ecological work in social computing and reveal that most interactions within clusters of subreddits with highly overlapping users are mutualistic. Our methods provide a foundation for future work investigating related online groups.  
+% \printbibliography[title={References},heading=secbib]
+
--- a/dissertations/nathante_uw_2021/ch4_competitive_exclusion.tex
+++ b/dissertations/nathante_uw_2021/ch4_competitive_exclusion.tex
@ -0,0 +1,933 @@
+%
+%% This is file `sample-authordraft.tex',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% samples.dtx  (with options: `authordraft')
+%% 
+%% IMPORTANT NOTICE:
+%% 
+%% For the copyright see the source file.
+%% 
+%% Any modified versions of this file must be renamed
+%% with new filenames distinct from sample-authordraft.tex.
+%% 
+%% For distribution of the original source see the terms
+%% for copying and modification in the file samples.dtx.
+%% 
+%% This generated file may be distributed as long as the
+%% original source files, as listed above, are part of the
+%% same distribution. (The sources need not necessarily be
+%% in the same archive or directory.)
+%%
+%% The first command in your LaTeX source must be the \documentclass command.
+% \documentclass[sigconf,authordraft]{acmart}
+
+
+%%%% As of March 2017, [siggraph] is no longer used. Please use sigconf (above) for SIGGRAPH conferences.
+
+%%%% As of May 2020, [sigchi] and [sigchi-a] are no longer used. Please use sigconf (above) for SIGCHI conferences.
+    
+%%%% Proceedings format for SIGPLAN conferences 
+% \documentclass[sigplan, anonymous, authordraft]{acmart}
+    
+%%%% Proceedings format for conferences using one-column small layout
+%\documentclass[acmsmall,authordraft]{acmart}
+    
+% NOTE that a single column version is required for submission and peer review. This can be done by changing the \documentclass[...]{acmart} in this template to 
+% \documentclass[sigconf,review=True]{acmart}
+\chapterprecishere{
+% Most explanations of changes in online group size focus on internal factors like social structures or design decisions. 
+% do not make the , and render critical questions like “which other groups are a given group's strongest competitors or mutualists?”  unanswerable.
+% TODO: Polish abstract
+% Online groups interact with each other as people, content and ideas flow among them. 
+We introduce a method for inferring competitive and mutualistic interactions between online groups from time series participation data based on the theoretical framework of community ecology. Platforms often host multiple online groups with highly overlapping topics and members. How can researchers and designers understand how interactions between related groups affect measures of group health? Inspired by population ecology, prior social computing research has studied competition and mutualism among related groups by correlating group size with degrees of overlap in content and membership. The resulting body of evidence is puzzling as overlaps seem sometimes to help and other times to hurt. We suggest that this confusion results from aggregating intergroup relationships into an overall environmental effect instead of focusing on networks of competition and mutualism among groups as our approach does. We compare population and community ecology analyses of online community growth by analyzing clusters of subreddits with high user overlap but varying degrees of competition and mutualism.
+}
+
+%%
+%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
+%% Please copy and paste the code instead of the example below.
+%%
+% \begin{CCSXML}
+% <ccs2012>
+%  <concept>
+%   <concept_id>10010520.10010553.10010562</concept_id>
+%   <concept_desc>Computer systems organization~Embedded systems</concept_desc>
+%   <concept_significance>500</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10010520.10010575.10010755</concept_id>
+%   <concept_desc>Computer systems organization~Redundancy</concept_desc>
+%   <concept_significance>300</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10010520.10010553.10010554</concept_id>
+%   <concept_desc>Computer systems organization~Robotics</concept_desc>
+%   <concept_significance>100</concept_significance>
+%  </concept>
+%  <concept>
+%   <concept_id>10003033.10003083.10003095</concept_id>
+%   <concept_desc>Networks~Network reliability</concept_desc>
+%   <concept_significance>100</concept_significance>
+%  </concept>
+% </ccs2012>
+% \end{CCSXML}
+
+% \ccsdesc[500]{Computer systems organization~Embedded systems}
+% \ccsdesc[300]{Computer systems organization~Redundancy}
+% \ccsdesc{Computer systems organization~Robotics}
+% \ccsdesc[100]{Networks~Network reliability}
+
+%%
+%% Keywords. The author(s) should pick words that accurately describe
+%% the work being presented. Separate the keywords with commas.
+% \keywords{datasets, neural networks, gaze detection, text tagging}
+
+%% A "teaser" image appears between the author and affiliation
+%% information and the body of the document, and typically spans the
+%% page.
+
+% \begin{teaserfigure}
+%   \includegraphics[width=\textwidth]{sampleteaser}
+%   \caption{Seattle Mariners at Spring Training, 2010.}
+%   \Description{Enjoying the baseball game from the third-base
+%   seats. Ichiro Suzuki preparing to bat.}
+%   \label{fig:teaser}
+
+% \end{teaserfigure}
+
+%%
+%% This command processes the author and affiliation and title
+%% information and builds the first part of the formatted document.
+
+
+% \fontsize{12pt}{24pt}
+% \selectfont
+
+%% We're going for a "known puzzle" + "clarifying confusion" framing
+%% Remember to frame around the depvar 
+
+%% TODO: rewrite with a new outline
+%% Introduction, Related Work, Materials & Methods, Results, Discussion, Conclusions
+%% Put research question in the introduction. 
+%% Put hypotheses in Related Work.
+%% Consider Hypothesizing that mutualism will be more common than competition because subreddits in these clusters are specialized.
+%% Cut unneeded ecological terms
+%% Define needed ecological terms
+
+\section{Introduction}
+\label{sec:intro}
+
+% Why we need an ecological approach
+%Online groups are important places where people collaborate to produce information sources, engage in discussions and participate in culture.
+Although the fact is frequently ignored in social computing scholarship, online groups do not exist in isolation.\footnote{We use the term ``online group'' instead of ``online community'' to help avoid confusion with our term ``community ecology'' which plays an important conceptual and analytic role in our paper.} Indeed, although studying interdependence between online groups is different and complex \citep{hill_studying_2019}, research in social computing has sought to quantify how online groups share users or topics \citep{datta_identifying_2017, del_tredici_semantic_2018, tan_all_2015, hessel_science_2016}, and how such interactions relate to outcomes like the emergence of new groups \citep{tan_tracing_2018}, contributions to peer-produced knowledge \citep{vincent_examining_2018}, and the spread of hate speech \citep{chandrasekharan_you_2017}.  Although this work has demonstrated that intergroup interactions matter, very little intergroup research has tackled questions of group success---i.e., why some online groups succeed in maintaining active and long-lived participation while most do not.
+%\citep{kraut_role_2014, resnick_starting_2012}. % commented out since there was no response
+Can intergroup relationships 
+% competition or mutualism between online groups 
+explain whether online groups will grow or decline?
+% NOTE: I guess you've added the footnote above to address the reviewer concern. It's important but (a) I think it's too early in the manuscript to bring this in and (b) it should be in a footnote. -mako
+% I moved it below by the RQ. 
+
+%a growing body of social computing research shows that online groups, such as wikis, discussion forums and mailing lists spawn new groups and wage conflicts against, compete with and help each other citep{datta_identifying_2017, tan_tracing_2018, wang_impact_2012, zhu_impact_2014}.  
+
+% individual chances of success while mutualistic dynamics increase them. 
+
+% How do relationships between groups shape their chances of success? 
+
+% What's wrong with previous ecological approaches
+% Should we introduce ecological theory in the introduction at all?
+
+Studies in social computing have drawn from organizational ecology to answer this question \citep{wang_impact_2012, zhu_impact_2014, resnick_starting_2012, zhu_selecting_2014}.  Inspired by the ecological study of biological systems, organizational ecology is an influential body of theory in sociology that studies competition and mutualism among human organizations.
+% , ranging from commercial industries to social movements  \citep{hannan_population_1977, baum_ecological_2006}. 
+% NOTE: There's a jump between this sentence and the last one. I think we might need to signal, somehow, that orgecol is not puzzling or the results in soccomp are puzzling in regards to them. I've changed puzzling below to inconsistent but we should make it clear what it's inconsistent with. -mako
+Although ecological studies of firms and social movements have developed a clear and established body of theory with strong empirical support \citep{baum_ecological_2006}, similar studies of online groups have yielded inconsistent results that differ both from one context to another and from theoretical predictions. For example, wikis whose memberships overlap with other wikis survived longer \citep{zhu_selecting_2014}, but Usenet groups with overlapping memberships failed more quickly \citep{wang_impact_2012}. 
+
+% NOTE: I'm not sure conflation is the right term here. I've reworked this paragraph below -mako
+% I think you nailed it. -- nate
+We argue that these confusing results are the result of a conflation of concepts and measures from two distinct strands of theory in organizational ecology: \emph{population ecology} and \emph{community ecology}. Both define competition as a form of interdependence that \emph{decreases} growth and mutualism as one that \emph{increases} growth.  However, population ecology focuses on modeling how overlapping resources among groups affect their subsequent growth, decline, or survival \citep{astley_two_1985, baum_ecological_2006, dobrev_dynamics_2001}. It does not attempt to directly study competitive and mutualistic interactions. On the other hand, community ecology recognizes that groups often exist within ``ecological communities,'' or clusters of highly related entities, and provides an approach for inferring competitive and mutualistic interactions among these. Although the stated goal of ecological research in social computing  has been to understand how groups influence each other's ability to sustain participation, ecological research in social computing has relied exclusively on concepts and measures from population ecology. This paper seeks to explain the puzzling set of findings in ecological social computing research by introducing community ecology.
+
+%These strands have different concepts of ecological dynamics, different levels of analysis and make distinct theoretical predictions \citep{astley_two_1985}.  
+% despite the fact that doing so is vital to 
+
+
+
+% Our contributions to CSCW are theoretical, methodological, and empirical.  
+
+% Our theoretical contribution, articulated in §\ref{sec:community_ecology},  
+
+% We then demonstrate both approaches by investigating our research question: 
+% \textit{(\textbf{RQ}) How does community ecology's view of competition and mutualism in online groups compare to that of population ecology?}
+
+% Our overarching goal is to introduce community ecology as a theoretical and methodological framework for understanding how the relationships between specific online groups shape their growth or decline. 
+
+We do so in a three-part empirical study using a dataset drawn from the 10,000 communities on Reddit with the most contributors to analyze 641 clusters of online groups with overlapping participants. 
+In Study A, we conduct the most important type of population ecology analysis, a test of what is called density dependence theory, and find support for the theory. 
+%This  suggests that competition is strongest when user overlap is high and mutualism is weakest when overlap is low. 
+This analysis suggests that high degrees of user overlap are associated with competition. 
+%VAR models are widely used in biological ecology to make inferences about competitive or mutualistic interactions between species. 
+In Study B, we introduce our method for community ecology analysis that infers networks of competitive and mutualistic interactions by using clustering analysis and vector autoregression (VAR) models of group size over time \citep{sims_macroeconomics_1980, canova_var_2007, ives_estimating_2003}. We illustrate the method in four case studies and present a large-scale computational analysis showing that mutualistic interactions are far more common than competitive ones.    
+Finally, in Study C, we bring Study A and Study B together to compare population ecology and community ecology by extending the density dependence model from Study A with a variable accounting for competition and mutualism. While we find that adding this variable does not help predict growth, including ecological interactions in our VAR models improves time series forecasting. 
+
+% importance of accounting for mutualistic and competitive interactions in predicting the growth of online groups.  We
+
+% While models including , .
+
+We discuss how these findings illuminate the differences between population ecology and community ecology and show how the two perspectives are complementary.
+While Study A suggests that competition is strongest when user overlap is high, Study B finds widespread mutualism among groups with overlapping membership.
+Although these findings might seem contradictory, they reflect how population ecology studies overlapping resources related to favorable or unfavorable environmental conditions, while community ecology studies competitive and mutualistic interactions playing out in local networks of specific groups. By demonstrating that mutualistic and competitive interactions within clusters of highly related groups are important---and by describing how to measure them---this paper lays the groundwork for future research to investigate and design for interdependence between online groups that supports their growth and success. 
+
+%we demonstrate that interactions are important and how to  inferred and are useful for time series forecasts of 
+
+% and inform design
+
+% by understanding
+
+%lays the groundwork for future research toward design
+
+% understanding how different forms of 
+
+
+
+
+
+% To answer this question,  We validate our approach by showing in §\ref{sec:res.forecasting} that
+
+% % NOTE: Is it (1) the top 1000? It would be nice to summarize the comprehensiveness here. (2) I'm ambivalent about the word "network" here. -mako 
+% We make four specific empirical contributions:   Reddit in §\ref{sec:res.characterizing} and .  
+
+% and provide an explanation for why previous ecological research in social computing has led to confusing and inconsistent results. 
+
+
+
+% NOTE: Is the sentence below correct? I guess so (at least indirectly) but I haven't read the new discussion. -mako New discussion isn't written yet, but right now that explanation is in the background section. :) -N
+
+% NOTE: cut this last sentence? -mako - I think this last sentence will be a more accurate reflection of the discussion.  -N
+%  We 
+
+%  We 
+
+% We make a theoretical contribution by introducing the community ecology perspective  that We also make a methodological contribution by providing a method for inferring these relationships from time-series data on group sizes 
+
+% Where prior approaches aggregate individual relationships between groups, our approach makes it possible to answer critical questions like ``which are a given online group's  mutualists or competitors?'' 
+
+% In the process, our theoretical work brings clarity to a confusing set of empirical results in prior research.
+
+%Discussing this seemingly contrasting finding motivates future investigations into how competitive or mutualistic ecological communities form and why some environments for online groups are competitive or mutualistic. 
+
+%  This method builds on a popular approach in biology that provides robust inferences about networks of ecological relationships. , analysis of stability, forecasts of future participation, and can scale to analyze systems of dozens of related communities. We apply this approach to four datasets. 
+
+% We validate our method using simulated data to show that it can identify a full range of ecological relationships and conduct a series of three case studies of groups hosted on the platform Reddit in \textsection \ref{sec:case.studies}. Although limited, these case studies make a third contribution in the form of empirical findings that suggest that specific patterns of relationships vary substantially across networks of groups and that mutualism appears to be much more common than competition.
+
+\section{Related Work}
+\label{sec:related.work}
+
+% One sentence on "timeliness." Find citations (Chowdry, Benkler, 
+Online groups are important sites for social support \citep{de_choudhury_mental_2014}, entertainment \citep{ducheneaut_alone_2006}, information sharing \citep{benkler_wealth_2006}, and political mobilization of disinformation campaigns and protest movements \citep{choudhury_social_2016, benkler_social_2013, krafft_disinformation_2020}.
+% knowledge of the ecosystem of online groups is important for advancing social science and informing future designs to support and manage online groups. 
+Although an online group's ability to achieve its goals depends on attracting and retaining contributors, few develop a sizable group of participants \citep{benkler_wealth_2006, dimaggio_social_2001, johnson_emergence_2014, koh_encouraging_2007, kraut_role_2014}. Many attempts to explain the success and growth of online groups look to properties of individual groups like characteristics of founders \citep{kraut_role_2014}, language use \citep{danescu-niculescu-mizil_no_2013}, turnover \citep{dabbish_fresh_2012}, and designs for regulating behavior \citep{halfaker_rise_2013, teblunthuis_revisiting_2018}.
+
+Recent research suggests that interdependence among online groups is also important to explain success and failure \citep{cunha_are_2019, kairam_life_2012, tan_all_2015, tan_tracing_2018}. 
+For example, banning hate subreddits reduced hate speech in related subreddits \citep{chandrasekharan_you_2017}. In a very different context, there is evidence that Reddit and Stack Overflow receive substantial benefits from activity on Wikipedia \citep{vincent_examining_2018}.
+% ; and editors make valuable and qualitatively different contributions across different languages of Wikipedia \cite{hale_cross-language_2015}. In addition, growth trajectories of online groups initially about similar topics can diverge \cite{zhang_understanding_2021}. 
+Our work contributes to this literature by providing a new conceptual lens and statistical method for studying competition and mutualism between online groups. 
+
+% , which theorizes how online groups depend on distinct types of resources.
+% As we discuss in §\ref{sec:rdp}, the nature of these resources makes possible conditions for mutualism or competition.  In §\ref{sec:ecology_background}, we explain how prior ecological studies of online groups extended RDT to consider how overlapping resources between communities can drive competition and mutualism and propose our first hypothesis which replicates part of these studies in Reddit, our empirical context.  Finally, in §\ref{sec:community_ecology}, we draw anew from biology and organizational ecology to present our community ecology approach and propose hypotheses to validate its usefulness for predicting the growth of online groups.
+
+\subsection{Online Groups Depend on Resources}
+\label{sec:rdp}
+
+Like prior ecological research in social computing and information systems, we build on resource dependence theory (RDT) \citep{butler_membership_2001, wang_impact_2012}. 
+\citet{butler_membership_2001} introduces
+RDT to argue that growth in online groups is driven by positive feedback as participants contribute resources such as content, information, attention, or social interactions, which motivate further contributions by subsequent participants. That said, online groups do not grow forever and RDT explains that growth is self-limiting because costs of participation increase in larger groups \citep{butler_membership_2001, butler_attraction-selection-attrition_2014}.
+
+
+% While growth far from the only criteria of success for an online group, much social computing research follows RDT by seeking to support groups' growth and survival through the attraction or retention of members \cite{koh_encouraging_2007, kraut_role_2014, cunha_are_2019}. 
+
+% For example, explanations of Wikipedia's transition from growth to decline  structures for quality assurance in a growing project that constituted barriers to newcomer participation \cite{halfaker_rise_2013, teblunthuis_revisiting_2018} spawned significant interest in designs for increasing newcomer retention that have met with limited success \citep[e.g.][]{halfaker_snuggle:_2014, morgan_tea_2013, narayan_wikipedia_2017}. Social structures like leadership, organizational practices, network structure, and design decisions can lower costs and increase benefits of participation \cite{butler_membership_2001, kraut_role_2014, tsugawa_impact_2019}. 
+
+
+%TODO: incorporate the below citations to "demonstrate that this is of importance to the social computing audience""  Also cite Charlie's paper about cross-platform interdependence
+
+%We review this foundational work in §\ref{sec:resource_dep} and then narrow our focus to prior ecological studies and other empirical work about interdependence between online groups in §\ref{sec:ecology_background}. Then, in §\ref{sec:community_ecology} we review sociological research developing community ecology theory and apply it to online groups.  
+  
+% It also builds closely on two bodies of ecological theory: first, explanations from population ecology that describe entities as sharing resources in environments and second, explanations from community ecology that theorize networks of specific community relationships.
+% In our background we introduce the first two bodies of related work in sections \ref{sec:resource_dep} and \ref{sec:ecology_background}.
+    
+    % Frame around the dependent variable: 
+    
+    % Explaining participation is important because 
+    % 1. It's a longstanding concern of the field
+    % 2. Online Groups are important to society  
+    % models 
+    % ranging from entertainment, information exchange, social interaction, to the collaborative production of knowledge and organization of collective action
+
+
+% This positive feedback between the value of prior contributions and the motivation for future contributions drives community growth.  
+% Think about the implications of our findings for the rival vs nonrival resources that could be in play.
+
+% Maybe try to deepen the discussion of resource competition, or maybe its better to avoid getting dragged into this.
+
+Ecological approaches recognize that interrelated online groups may share resources with one another in ways that constrain their growth and survival. \textit{Rival} resources like participants' time, attention, and efforts raise the possibility of competition because they become unavailable to others when used by one group \citep{benkler_wealth_2006, kubiszewski_production_2010, ostrom_public_1977,romer_endogenous_1990}. RDT suggests that declines in online participation can be explained in terms of competition over important rival resources \citep{wang_impact_2012}.
+% Online participation in general has opportunity costs and may compete with alternatives like sleep, entertainment, or work \cite{becker_theory_1965, butler_attraction-selection-attrition_2014}.
+% So online groups that provide similar benefits may be the most likely competitors because once someone has obtained satisfying benefits from one group they may go offline or switch to another activity instead of seeking similar benefits from competitor groups.\footnote{Economists refer to these as ``substitutes.' }
+
+% providing the same benefits at lesser costs might be a compelling alternative.
+% If different online groups can substitute for participation in one another and participation is rival this will lead to competition between the communities and decrease participation in both.
+% Public goods are nonrival because their usefulness is not diminished when others use them.
+
+On the other hand, online groups also rely on \textit{nonrival} resources. They can even produce connective and communal public goods like opportunities to communicate or collections of information \citep{fulk_connective_1996} which can be ``antirival'' when their usefulness increases as a result of others using them \citep{kubiszewski_production_2010, weber_political_2000}. For example, the usefulness of a communication network increases as more people join it \citep{fulk_connective_1996, katz_network_1985}. Similarly, the usefulness of an information good can increase as more people come to know, refer to, and depend upon it \citep{kubiszewski_production_2010, weber_political_2000}.
+% as when 
+%Awareness that an online group provides an audience can motivate participation  \cite{zhang_group_2011}. 
+If multiple online groups help build the same connective or communal public goods, they may form mutualistic interactions where contributions to one group may ``spill over'' and motivate participation in mutualist groups \citep{zhu_impact_2014}.   
+Ecological approaches seek to understand how different types of resources will limit or promote growth.
+% as was demonstrated when Chinese government blocked the Chinese language edition of Wikipedia, unblocked contributors decreased their participation 
+%
+
+
+%As a result, researchers, designers, and managers of online communities often set aside thorny questions of interdependence between online communities.  
+%While extensions of the resource dependence framework recognize the importance of exit from online communities \cite{butler_attraction-selection-attrition_2014}, they do not say where people go when they leave.  % Before turning to our theory of community ecology, we note differences between ecological theory and analysis in organization and biological science from  other uses of the term ecology in HCI and social computing. 
+% The term ``ecology'' often connotes interconnectedness, complexity, growth, and nature, and also crises of resource sustainability, loss, and extinction \cite{worster_natures_1994, blevis_ecological_2015}.  Most references technologists make to ``ecology'' 
+% For example Nardi and O'Day invoke the ecological metaphor in describing their vision for individuals to cultivate intentional and localized relationships with technology \cite{nardi_information_2000, bowker_bonnie_2001}.   
+% This continues a long-running intellectual exchange between social and biological sciences.  Economic thought was strongly influenced by Darwinian evolution and ecologists in biology were influenced by economic models to understand and solve problems in forestry and conservation \cite{kropotkin_mutual_2012, worster_natures_1994}. Once modern ecological science was developed it was not long before it was applied to understand human societies \cite[e.g.][]{park_human_1936, hawley_human_1986}.  Because theories of organizational ecology were crafted to address particular concerns in organization science and are laden with assumptions appropriate to traditional firms with fixed and durable boundaries, our ecological approach also draws from biology.
+
+% TODO This section needs a number of new concrete examples.  Revisit the ecological literature as well.  Also perhaps add some examples from the interview paper (which we'll cite and anonymize).    
+\subsection{Population Ecology, Density Dependence and Overlapping Resources}
+\label{sec:ecology_background}
+
+% Our theoretical approach draws from ecology. 
+While this paper focuses on the ecological study of online groups, other social computing and HCI scholars have used the term ``ecology'' (and related concepts like ``ecosystem'' and ``environment'') to denote an assemblage of sites, devices, or platforms \citep{nardi_information_1999,wang_coming_2015}. We use the term more narrowly to refer to conceptual and mathematical models of ecological dynamics. 
+In particular, our work builds on a tradition rooted in \textit{organizational ecology}. First developed in the late 1970s by sociologists studying interactions between firms, organizational ecology was inspired by, and has drawn closely from, ecological studies in biology \citep{hannan_population_1977}. 
+
+Because online groups bear similarities to traditional organizations, organizational ecology provides a compelling theoretical framework for  understanding interdependence among online groups.  It has inspired at least three high-quality empirical studies of how resources shared by online groups shape their growth, decline, or survival \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+These studies draw from the \textit{population ecology} strand of organizational ecology
+%, while we introduce \textit{community ecology} as an alternative. 
+that studies ecological dynamics within a population of groups. In organizational ecology, populations have been defined as sets of organizations sharing an organizational industry or business model \citep{hannan_organizational_1989}.  In social computing, populations have been defined as online groups sharing a given social media platform \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+
+While population ecology involves several distinct theoretical propositions, \textit{density dependence theory} (DDT) is perhaps the most prominent and is the subject of all three prior ecological studies of online groups \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.  DDT models competitive or mutualistic forces in a population of groups as a function of \textit{density} which, in the earliest and most influential studies of DDT, is simply the size of the population. In this way, DDT assumes that every group in the population is facing the same competitive and mutualistic pressures \citep{aldrich_organizations_2006}.
+However, online groups sharing a platform have diverse topics \citep{kairam_life_2012}, norms \citep{chandrasekharan_internets_2018, fiesler_reddit_2018}, and user bases \citep{tan_all_2015}. Because groups sharing few resources are unlikely to be strongly interdependent, ecological studies of online groups have modeled density dependence based on the concept of \emph{overlap density} \citep{baum_ecological_2006, dobrev_dynamics_2001, wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}. Rather than the number of groups that exist in a population, overlap density measures the extent to which one group's members or topics overlap with all other groups'. Overlap density thus characterizes a group's \emph{niche} or local \emph{resource environment} defined by its distinctive topic and membership.
+
+
+%Unlike \citet{datta_identifying_2017}, we do not divide user frequency by the number of subreddits where the user appears because we do not wish to assume that users who comment in many subreddits are less ecologically important.
+
+%Overlap density is thus not a property of a population of groups, but a property of the resource environment a particular group faces. 
+
+
+% While foundational studies of density dependence in organizational research measu
+% red density and growth at the population level, ecological studies of online groups .\footnote{Although it is less common in organizational research, overlap density has also been used by some organizational ecologists \cite[e.g.][]{dobrev_dynamics_2001}.}  
+% Are  this paragraph and the next one necessary or just confusing?
+DDT proposes a model for the growth of organizational populations that has a similar structure to the RDT model for the growth of online groups proposed by \citet{butler_membership_2001}.
+In DDT, mutualism is the engine of positive feedback driving population growth. Organizational ecologists show how successful organizations in an emerging industry develop nonrival resources like the legitimacy of a business model or industrial know-how that attract new organizations to enter the market \citep{carroll_density_1989,hannan_organizational_1989}. Similarly, a population of online groups, such as those sharing a platform, may grow in size as their platform gains in popularity, as established groups spin off new ones, and as useful knowledge develops that can be shared between groups \citep{tan_tracing_2018, zhu_impact_2014}.
+
+
+% TODO add a footnote to show the analytical equivalence between the models and connection to Malthus.  
+In RDT, growth of online groups is self-limiting because of the challenges in managing large groups \citep{butler_membership_2001}. In DDT, competition among population members over rival resources limits growth \citep{hannan_organizational_1989}. DDT thus proposes a trade-off in which low density reflects limited opportunities for mutualistic contributions of nonrival resources like legitimacy, connectivity, and knowledge, but high density reflects competition over rival resources.  
+Therefore, DDT predicts that the relationship between density and positive outcomes like growth or survival is  $\cap$-shaped (inverse-U-shaped) \citep{baum_ecological_2006, carroll_density_1989}.
+
+% Save the potential conflict between RDT and DDT for the discussion
+% An individual online group's growth may be limited by the ability of their social structures to scale to include more members  \citep{butler_membership_2001}  or due to competition with other groups over members \citep{hannan_organizational_1989}.
+
+%In a homogenous population or in cases where litt
+%Population ecologists have used a number of definitions of population, but they often refer to sets of organizations having the same organizational form or business model. 
+
+%This is because many environments present a trade-off between mutualism and competition:  mutualistic forces are stronger when density is low and competitive forces are stronger when density is higher.  The intuition is that low-density environments reflect poor environmental conditions for success---if conditions were good then they would attract more growing communities hence be more dense. On the other hand, high-density environments are thought to become crowded and competitive \citepp{hannan_organizational_1989}.
+
+Tests of DDT in populations of online groups yield inconsistent results. In \citet{wang_impact_2012}, user overlap in Usenet newsgroups is associated with decreasing numbers of participants. Similarly, \citet{teblunthuis_population_2020} find that topical overlaps between online petitions are negatively associated with participation. By contrast, \citet{zhu_impact_2014} find that membership overlap is positively associated with increasing survival of new Wikia wikis. Only \citet{zhu_selecting_2014} find support for the $\cap$-shaped relationship predicted by DDT in an enterprise social media platform.
+
+In Study A, we provide a test of DDT using data from Reddit. The classical logic of DDT appears reasonable in the context of Reddit because low overlap density is likely to reflect an impoverished environment lacking in nonrival resources like skills and knowledge of experienced users, while a group with high overlap is likely to face competition over its members \citep{zhu_selecting_2014, zhu_impact_2014}:
+\textit{(\textbf{H1}) The relationship between overlap density and the growth of online groups is  $\cap$-shaped (inverse-U-shaped).}
+% such as the 
+
+%DDT sees competition and mutualism as environmental properties of an online group's niche. 
+
+DDT proposes that very high levels of density will decrease growth because of increasing forces of competition within a niche. However, to conclude that groups with the greatest membership overlap are likely competitors would be to commit a well-known statistical fallacy
+% (the term ecological fallacy does not refer to theories of population or community ecology, but rather to ``ecological correlations,'' meaning correlations involving aggregates)
+\citep{piantadosi_ecological_1988, robinson_ecological_1950}.
+The density of a group's environment suggests that it faces competition or mutualism, but it does not tell us which overlapping communities are competitors and which are mutualists.
+% DDT therefore relates resource overlaps to the growth of online groups, yet stops short of inferring competitive or mutualistic interactions among them. It does not provide a way of learning when and why groups are mutualists or competitors and this limits its ability to inform designs that take these interactions into account. 
+Community ecology overcomes this limitation of DDT.
+
+\subsection{Introducing Community Ecology \label{sec:community_ecology}}
+
+Perhaps the most natural way to understand the distinction between population ecology and community ecology is in where they believe ecological dynamics like competition and mutualism play out \citep{astley_two_1985}.   While population ecology locates competition and mutualism within an environmental niche, community ecology locates competition and mutualism in networks of interdependent groups called \emph{ecological communities} \citep{aldrich_organizations_2006}. In organizational ecology, this can mean studying interactions between different organizational populations \citep[e.g.][]{sorensen_recruitment-based_2004, mcpherson_ecology_1983}, or networks of interactions between organizations \citep[e.g.][]{powell_network_2005, margolin_normative_2012}. 
+%Doing so makes visible the distinctive roles that particular groups play. 
+While varying conceptions of community ecology are found in the organizational ecology literature \citep{freeman_community_2006}, the approach we describe is identical in structure to that taken by \citet{aldrich_organizations_2006} and \citet{hawley_human_1986}.
+
+Community ecology focuses on \emph{ecological interactions} \citep{aldrich_organizations_2006}.
+%In organizational ecology, these interactions are referred to as ``commensal relationships.'' However, biologists use the term  ``commensal'' quite differently  to mean an unreciprocated mutualistic interaction in which one species provides benefits to another while being unaffected by it.  While for the most part, we draw our conceptions and terminology from organizational ecology rather than biology, the use of the term ``commensalism'' in organizational ecology can be confusing.  We therefore adopt the term ``ecological interaction.''
+Ecological interactions can be mutualistic when one group has a positive influence on the second such that growth in the first group leads to growth in the second.  They can also be competitive if one group has a negative effect on the second such that growth in the first group leads to decline in the second. Ecological interactions can be reciprocated if mutualism (or competition) from one group to another group is returned in kind. An ecological interaction can also be mutualistic in one direction and competitive in the other.  The competitive or mutualistic interactions in an ecological community are quantified by the \emph{community matrix}, a central analytical object in community ecology in both biology and organization science \citep{verhoef_community_2010, novak_characterizing_2016, aldrich_organizations_2006}. 
+
+In Study B, we demonstrate community ecology by inferring networks of ecological interactions in ecological communities on Reddit. Because our understanding of community ecology theory does not suggest hypotheses about what we will find, we conduct an exploratory data analysis to determine whether mutualism or competition among subreddits is more common on Reddit and present case studies illustrating the types of ecological communities we identify.
+
+%So a commensal relationship exists between each pair of groups in an ecological community.
+
+% There are six possible ecological interactions as described in Table \ref{tab:interaction.types}.  Note that they can be reciprocal (as in full mutualism and competition) or not (as in partial mutualism and competition). In our framework ``predation'' is an interaction that is positive in one direction but negative in the other. It is also possible that growth or decline in the first group has no effect on the second group, and visa-versa, a situation termed ``neutrality.''
+
+
+% \begin{table}
+%     \caption{The five possible ecological interactions between two online groups. Values in the column ``i $\rightarrow$ j'' represent the sign of $\phi_{i,j}$ group i's effect on group j.  Based on table 11.1 from \citet{aldrich_organizations_2006}.}
+%     \centering
+%     \begin{tabular}{c|c|c}
+%     i $\rightarrow$ j ($\phi_{i,j}$)& i $\rightarrow$ j  ($\phi_{i,j}$) & Interaction type  \\ \hline
+%     $+$ & $+$ & Full mutualism \\
+%     $+$ & $\cdot$ & Partial mutualism \\
+%     $+$ & $-$ & Predation \\
+%     $-$ & $\cdot$ & Partial competition \\
+%     $-$ & $-$ & Full competition \\
+%     $\cdot$ & $\cdot$ & Neutrality
+%     \end{tabular}
+%     \label{tab:interaction.types}
+% \end{table}
+
+% by conceiving of community ecology as the study of relationships between different groups.
+
+% Relationships studied in community ecology are defined by how they , but they are also important because networks of relationships 
+%and give rise to higher-order properties like stability. 
+
+%Our community ecology approach instead focus on relationships between communities from overlap density approaches to focuses on relationships between communities as a step toward solving the puzzle. 
+
+%Consider the example of how \citet{zhu_impact_2014} find membership overlap is associated with increasing survival of new Wikia wikis, but in \citepos{wang_impact_2012} study of Usenet groups user overlaps are associated with decreasing group sizes. 
+
+% Consider cutting this since we don't look at any other factors
+
+%study period,  and they found a stronger relationship when overlapping members were from more established groups. Perhaps the growth Wikia wikis was limited by knowledge of how to build a Wiki which was provided by more experienced users and user overlaps were correlated with access to such knowledge.  While 
+
+
+% What's the point of these three paragraphs?
+\subsection{Predicting Growth}
+
+In Study C we build upon our analyses from Study A and Study B by testing whether community ecology can explain the growth and decline of online groups in ways that population ecology cannot. We do this by analyzing in two different ways whether accounting for ecological interactions helps predict future group sizes.  
+% We expect it to do so because resource overlaps as modeled by DDT may be a poor proxy for the degree to which a group's environment is competitive or mutualistic. 
+In general, competition for overlapping resources will have no effect on group growth if something besides the overlapping resource limits growth \citep{verhoef_community_2010}. For example, two wikis might share a large number of contributors (they have high user overlap), but their growth might be limited by a lack of core contributors who perform important administrative tasks like policy making and software administration \citep{zhu_impact_2014}.   Community ecology relaxes the assumption that competition and mutualism are caused by user overlap density and instead seeks to infer these relationships from data.  We test the importance of this conceptual shift for predicting growth by testing two hypotheses. The first uses a model comparison approach to  test if adding a measure of ecological interactions to the density dependence model in Study A improves prediction of growth: \textit{(\textbf{H2})  A model with ecological interactions and density dependence predicts growth in online groups better than density dependence alone.}
+
+Support for H2 may be a relatively low bar for assessing whether ecological interactions are important factors shaping the growth of online groups because of confounding moderator or mediator variables related to the occurrence of ecological interactions.
+% For example, suppose mutualistic interactions were correlated with declining ecological communities.
+Therefore, we also use a time series forecasting approach to test whether modeling ecological interactions is useful for making time series forecasts of participation in online groups:
+%We seek to demonstrate in  whether including commensal relationships in time series forecasting models improves forecasting performance.  
+\textit{(\textbf{H3}) The addition of ecological interactions to a baseline time series model improves the forecasting performance.}
+While this does not directly compare population ecology and community ecology, it validates that ecological interactions are important.
+
+%With commensalism, we can seek to explain the puzzling results of resource overlap studies by exploring our second research question:\noindent \textbf{RQ2: How are degrees of user overlap and types of commensal relationships related?} 
+
+% This paragraph isn't helping very much
+% Ecological dynamics play out through the network of such relationships over time as represented by the \emph{community matrix}, $\Phi$.  
+
+
+% Analysis of the community matrix can reveal indirect relationships between groups and properties of an ecological community like stability \cite{ives_estimating_2003}. 
+%Seeing interdependence between online groups through a community ecology-based network of dynamical relationships can make visible special roles that particular groups play in an ecological community through their many mutualistic or competitive relationships.
+
+% Next we take a first methodological step toward answering questions like these by adapting vector autoregression models from biology and macroeconomics as an approach to inferring community matrices.  We then apply our approach in three case studies of related groups hosted on Reddit to reveal three qualitatively different ecological communities. 
+
+%% SOME BIKERACK RAISING MORE ISSUES WITH THE NICHE OVERLAP APPROACH
+
+% study online groups additionally shifts from an analogy of online communities as individual members of a biological species to online communities as species themselves and seeking to understand functional relationships between different online groups. 
+% Yet a closer examination of the analogy to density-dependence in organizational or biological populations reveals conceptual awkwardness.  At issue is the referent of the term ``niche.''  Should we use ``niche'' to refer to a set of resources that an online community can utilize?  This is what ``niche'' means in both overlap density and in our version of community ecology.
+
+% Social exposure is also important, but we don't deal with that in this .  The idea here is that the cost-benefit structure depends on alternatives which can lower costs or . 
+%VAR analysis can quantify the stability of the system and affords exploration of counterfactual forecasts to simulate hypothetical interventions \citep{ives_estimating_2003}.  
+
+
+\section{Materials \& Methods}
+\label{sec:methods}
+
+
+
+% The presentation of our materials and methods is organized as follows: First we introduce the methods and measures for Study A, beginning with 
+% \emph{user overlap} %(§\ref{sec:mes.overlap})
+% which is aggregated into \emph{overlap density} %(§\ref{sec:mes.density}) 
+% to predict subreddit \textit{growth} %(§\ref{sec:mes.growth}) 
+% in a loglinear regression model. Then, for Study B, we present 
+% our clustering procedure for identifying ecological communities % (§\ref{sec:clustering}) 
+% on which we fit VAR models % (§\ref{sec:var}) 
+% predicting \emph{group size}. % (§\ref{sec:mes.group.size}). 
+% To explore the types of ecological communities found on Reddit, we derive two measures from these models for each cluster: \emph{average ecological interaction} 
+%(§\ref{sec:mes.avg.mut}) 
+% which quantifies the degree of competition and mutualism in the ecological community and \emph{ecological interaction strength} %(§\ref{sec:mes.abs.int}) % which quantifies its overall intensity of ecological interactions. Next, we draw competition-mutualism networks in example ecological communities based on interpreting the VAR models using impulse response functions (IRFs) %(§\ref{sec:mes.irf}). 
+% Then, in Study C, we test H2 to compare community ecology and density dependence theory by adding \emph{subreddit average mutualism} %(§\ref{sec:mes.sub.mut}) 
+% to the model from Study A. Finally, we test H3 by evaluating whether including ecological interactions in the VAR models improves time series forecasting. % (§\ref{sec:mes.forecasting}).
+
+\subsection{Data}
+
+Our data are drawn from the publicly available Pushshift archive of Reddit submissions and comments which we obtained from December 5\textsuperscript{th} 2005 to April 13\textsuperscript{th} 2020
+\citep{baumgartner_pushshift_2020}. Within this dataset, we limit our analysis to submissions and comments from the 10,000 subreddits with the highest number of comments. There are 702 subreddits larger than the smallest subreddit included in our dataset having a majority of submissions marked ``NSFW,'' which typically indicates pornographic material. As others have done in large-scale studies of Reddit \citep[e.g.,][]{datta_identifying_2017}, we exclude these subreddits to avoid asking members of our research team to inspect clusters including pornography. The top 10,000 subreddits provide a sufficiently large number of ecological communities for our statistical analysis. 
+
+\subsection{Study A: Density Dependence Theory} % and Community Ecology}
+\label{methods:density}
+
+
+\subsubsection{User overlap \nopunct} \label{sec:mes.overlap} 
+ $o_{i,j}$ quantifies the degree to which two subreddits ($i$ and $j$) share users. 
+ %From it we construct clusters of related groups in §\ref{sec:clustering} and quantify overlap density in §\ref{sec:mes.density}.  
+\citet{zhu_impact_2014} and \citet{wang_impact_2012} both measure user overlap between two groups by counting the number of users contributing to both groups at least once and exclude users who appear in more than 10 groups. In our preliminary analysis, we found that this measure led to similarity measures and clusters with poor face validity.  These issues may have stemmed from how Reddit users often peripherally participate in many groups while participating heavily in few \citep{tan_all_2015, hamilton_loyalty_2017, zhang_community_2017}. Therefore, our measure of user overlap follows  \citet{datta_identifying_2017} by using the number of comments each user makes in each pair of groups.
+
+To measure user overlap between subreddits, we first build user frequency vectors by counting the number of times each user comments in each subreddit. We prevent giving undue weight to subreddits with higher overall activity levels by normalizing the comment counts for each subreddit by the maximum number of comments by a single author in the subreddit:
+
+\begin{equation}
+    f_{u,j} = \frac{n_{u,j}}{\max_{v\in\mathrm{J}}n_{v,j}} \label{eq:user.frequency}
+\end{equation}
+
+\noindent where $n_{u,j}$,  the user frequency, is the number of times that user $u$ authors a comment in subreddit $j$.
+
+This results in a user frequency vector $F_j$ for each subreddit that is sparse and high-dimensional, having one element for each user account that comments in any subreddit in our dataset.
+% In the course of developing our clustering analysis described in §\ref{sec:clustering}, we found that following an approach analogous to latent semantic analysis (LSA) improved the quality of our clusters. 
+Next, we use LSA to reduce the dimensionality of the user frequency vectors. 
+LSA is based on the singular value decomposition and is common in natural language processing and information retrieval. LSA preserves subreddit similarities while removing noise and dealing with sparsity \citep{dumais_latent_2004}:
+
+\begin{align}
+    \mathbf{F} &= \mathbf{U \Sigma V}^T \nonumber \\
+    \widetilde{F_{j}} &= \mathbf{U_k}^TF_j \label{eq:user.frequency.svd}
+\end{align}
+
+\noindent $\mathbf{F}$ is the matrix where columns are author frequency vectors $F_j$ and $\mathbf{U \Sigma V}^T$ is its singular value decomposition. Truncating the singular value decomposition to use only the first $k$ left-singular vectors gives $\mathbf{U_k}$. Left-multiplying a subreddit's author frequency vector by $\mathbf{U_k}^T$ transforms the high-dimensional author frequencies into $\widetilde{F_j}$, their approximation in the $k$-dimensional space. 
+% We choose $k=600$ in the course of our grid search for a good clustering described below in §\ref{sec:clustering}.
+
+%clustering with a high silhouette coefficient.
+
+We then obtain our measure of \textit{user overlap} by taking the cosine similarities between the resulting vectors for a pair of subreddits:
+\begin{equation}
+    o_{i,j} = \frac{\widetilde{F_{j}} \cdot \widetilde{F_{i}}} {\norm{\widetilde{F_i}} \norm{\widetilde{F_j}}} \label{eq:user.overlap}
+\end{equation}
+
+\noindent where $\norm{\widetilde{F_i}} = \sqrt{\sum_{x=1}^k \widetilde{f_{x,i}}^2}$ is the Euclidean norm of the transformed user frequencies for subreddit $i$.  
+
+
+
+
+%We use the following methods and measures in our tests of our hypothesis that the relationship between user overlap density the growth of online groups is $\cap$-shaped (H1) and our hypothesis that accounting for ecological interactions will help explain growth beyond overlap density (H2):
+
+% We measure \emph{overlap density} and \emph{growth} to and . To test \textit{\textbf{H2}}, we add the overall influence of ecological interactions on a subreddit 
+
+\subsubsection{Growth\nopunct}\label{sec:mes.growth} is the dependent variable in our density dependence model testing H1 and is also used in our test of H2 as part of Study C. Growth is measured as the change in the (log-transformed) size of a subreddit over the final 24 weeks of our data, from November 4\textsuperscript{th} 2019 to April 13\textsuperscript{th} 2020. 
+
+\subsubsection{Overlap density\nopunct} \label{sec:mes.density} $d_i$ is the normalized average user overlap for a given subreddit. It is the independent variable in our density dependence model testing H1:
+
+\begin{align}
+  d^*_{i} &= \frac{1}{\left|S\right|-1} \sum_{j\in S;j\ne i} o_{i,j} \nonumber \\
+  d_{i} &= \frac{d_i^*}{\max_j d_j^*} \label{eq:user.overlap.density}
+\end{align}
+
+\noindent where $S$ is the set of groups in our dataset.  
+
+\subsubsection{Regression model for H1} \label{sec:reg.H1}
+To test H1, we fit Model 1 % in Equation \ref{eq:M1}
+which has first and second-order terms for overlap density to allow for a curvilinear relationship between \emph{overlap density} and \emph{growth}.
+\begin{align}
+\mathrm{Model~1} & & Y_i = B_0 + B_1 d_{i} + B_2 d^2_{i}  \label{eq:M1}
+\end{align}
+\noindent where $Y_i$ is the growth of subreddit $i$ and $d_i$ is its overlap density.
+
+
+\subsection{Study B: Introducing Community Ecology}
+
+
+%Here we review the prior work on which we build our methodological approach to inferring competitive and mutualistic relationships between online groups. %\textsection \ref{sec:inferring} describes our own methodological contributions. 
+
+\subsubsection{Clustering to identify ecological communities}
+\label{sec:clustering}
+Analyzing networks of ecological interactions is the key difference between community ecology and population ecology. 
+% In Study A we set out to survey the types of ecological communities found on Reddit to provide a comparison with a large-scale population ecology analysis.
+% in \ref{sec:clustering}
+%Here, we use a heuristic approach based on clustering algorithms to find ecological communities of online groups that all have high user overlap.
+To identify ecological communities of related subreddits, we use a clustering procedure based on the user overlap measure described above in §\ref{sec:mes.overlap}.  
+We selected a clustering model using grid search to obtain a high silhouette coefficient \citep{rousseeuw_silhouettes_1987}. The silhouette coefficient captures the degree to which a clustering creates groups of subreddits with high within-cluster similarity.
+% relative to similarity with subreddits in other clusters. 
+
+Our description of our measure for user overlap in §\ref{sec:mes.overlap} does not explain how we choose the number of LSA dimensions $k$. 
+To do so, we ran the affinity propagation \citep{frey_clustering_2007}, HDBSCAN \citep{mcinnes_hdbscan_2017} and \textit{k}-means clustering algorithms and selected the algorithm, hyperparameters, and LSA dimensions $k$ that resulted in the clustering with a high silhouette coefficient having fewer than 5,000 isolated subreddits, and at least 50 clusters.  We limit the number of isolated subreddits because some choices of hyperparameters for the HDBSCAN algorithm could improve the silhouette coefficient, but at the cost of greatly increasing numbers of isolated subreddits.  Choosing a relatively high limit to the number of isolates helps ensure that our clusters contain highly related communities. We chose an HDBSCAN clustering with 731 clusters, 4964 isolated subreddits, $k=600$ LSA dimensions, and a silhouette score of 0.48.    
+We exclude the isolated subreddits from our analysis. More details about our clustering selection process are found in the online supplement.
+
+
+%In order to test H2 and answer RQ1, we estimate the community matrix of commensal relationships between selected communities of online groups. 
+We evaluate the external validity of the chosen clustering using the purity evaluation criterion \citep{manning_introduction_2018}.
+% :
+% \begin{equation}45
+%     \mathrm{Purity}=\frac{1}{N}\sum_{m\in M}\max_{d\in D}{|m \cap d|}
+% \end{equation}
+% \noindent Where $N$ is the number of clusters $M$, $D$ are ``true'' classes to which subreddits might belong and $max_{d\in D}|m \cap d|$ is the greatest number of subreddits in cluster $m$ that belong to the same class $d$.
+To do so, an undergraduate research assistant examined a random sample of 100 clusters including 744 subreddits.  By visiting the subreddits and using her own judgment, the assistant flagged subreddits that did not seem like a good fit for their assigned cluster. Using these labels and excluding 25 subreddits that have been deleted, made private, or banned, we calculated the purity of our clustering as 0.92. This means that we believe that 92\% of subreddits belong to their assigned cluster.
+% Note that although we clustered subreddits based on user overlap, we obtain a high purity score based on a subjective evaluation of the subreddits' contents. 
+
+%\subsection{Inferring Mutualistic and Competitive Interactions}
+
+% We find f(N.clusters) clusters and f(N.isolates) isolated subreddits. The median cluster has median.cluster.size subreddits and the largest cluster has 
+
+
+\subsubsection{Group size\nopunct} \label{sec:mes.group.size} is the dependent variable of the models we use to infer ecological interactions. Measured as the number of distinct commenting users in a subreddit each week, group size quantifies the number of people who participate in a subreddit over time. Typical of social media participation data, group size is highly skewed. Therefore, we transform it by adding 1 and taking the natural logarithm. 
+
+
+% The following three paragraphs probably belong in the methods section, but I'm trying to satisfy the reviewers.
+\subsubsection{Inferring ecological interactions using Vector Auto Regression}
+\label{sec:var}
+
+The community matrix $\mathbf{\Phi}$ of ecological interactions can be inferred from time series data using vector autoregression models (VAR models). VAR models are a workhorse in biological ecology because VAR(1) models (i.e., VAR models with a single autoregressive term) have a close relationship with the Gompertz model of population growth, which is widely used in ecology \citep{ives_estimating_2003}. Even in the presence of unmodeled nonlinearities, VAR(1) models can reliably identify competition or mutualism in empirically realistic scenarios \citep{certain_how_2018}. VAR models have also been widely adopted in the social sciences, particularly in political science and in macroeconomics \citep{box-steffensmeier_time_2014}. 
+
+% \citet{sims_macroeconomics_1980} advocated VAR modeling in macroeconomics to address a problem in the field as an alternative to structural equation modeling (SEM), which required detailed specification of a large number of theoretical assumptions to identify. 
+%similar to structural equation models but require fewer theoretical assumptions but are
+%VAR models are flexible enough to model a wide range of systems so long as sufficiently long time-series data are available \citep{sims_macroeconomics_1980}.
+VAR(1) models can be intuitively understood as a generalization of auto-regressive AR(1) models in time series analysis. But while AR(1) models predict the state of a single time series as a function of its previous value, VAR(1) models simultaneously predict multiple time series as a function of the values of every other variable in the system \citep{canova_var_2007, ives_estimating_2003}:
+
+\begin{equation}\label{eq:var1}
+Y_t = B_0 + B_1t + \sum_{k \in K}A_k x_{k,t} + \sum_{j \in M}\Phi_{j} y_{j,t-1} + \epsilon_t
+\end{equation}
+
+\noindent where $Y_t$ is a vector containing the sizes of a set of online groups ($M$) at time $t$. $B_0$ is the vector of intercept terms and $B_1$ is the vector of linear time trends ($b_{1,j}$) for each community ($j$). $\Phi_{j}$ represents the influence of $y_{j,t-1}$, the size of the $j^{\mathrm{th}}$ online group at time $t-1$ on $Y_t$. $\Phi_{j}$ is a column of $\mathbf{\Phi}$, a matrix of coefficients in which the diagonal elements correspond to intrinsic growth rates (marginal to the trend) for each online group and the off-diagonal elements are intergroup influences, and $\epsilon_t$ is the vector of error terms.
+
+Additional time-dependent predictors ($x_{k,t}$) can be included in the vectors $X_{k}$ with coefficients $a_k$. Because subreddits are created at different times, growth trends must begin only after the subreddit is created. We use $X_{k}$ to introduce a  counter-trend during the period prior to the creation of subreddits so that each group's growth trend begins in the period the group is created. For each group $j$ created at time $t^0_j$ we fill $X_{j}$ with the sequence $[1,2,3,\ldots\ ,t^0_j-1,0,0,0,\ldots\ ]$. In other words, $X_{j}$ adds a counter-trend only during the period prior to the first comment in subreddit $j$. We fix the elements $a_{j,i}$ of $A_j$ equal to 0 unless $i=j$, so the counter trend only influences subreddit $j$. This effectively sets $a_{j,j}$ approximately equal to $-b_{1,j}$. 
+
+We fit VAR(1) models using ordinary least squares as implemented in the \texttt{vars} \texttt{R} package to predict the group size each week over the history of each subreddit prior to November 4\textsuperscript{th} 2019 \citep{pfaff_var_2008}. We hold out 24 weeks of data for forecast evaluation and fit our models on the remainder. To ensure that sufficient data is available for fitting the models, we exclude 946 subreddits and 89 clusters having fewer than 156 weeks of activity. 
+
+% where the cluster data lacks the necessary degrees of freedom to fit the model because the length of the training time series is less than the size of cluster plus 2. 
+
+
+% We hold out the weeks from fit.date to to.date for evalution. % Some of the clusters were too large or had too low levels of activity We include only We include a vector of intercept terms (to account for different equilibrium community sizes) and a vector of trends (to account for long-run endogenous growth) because we found that including these terms greatly improved the fit of our models to the data. Our VAR(1) models have this form in vector notation:
+
+%$$ Y_t = \Mu + \Phi_1 Y_{t-1} + \ldots + \Phi_p Y_{t-p} + \epsilon_t $$ 
+% TODO: avoid mixing matrix and vector notation.
+
+\subsubsection{Characterizing ecological communities}
+\label{sec:characterizing.ecological.communities}
+
+In Study B, we interpret the community matrix $\mathbf{\Phi}$ as a directed network of ecological interactions, a \emph{competition-mutualism network} \citep{ives_estimating_2003}. Although the elements of $\mathbf{\Phi}$ correspond to direct associations between group sizes \citep{novak_characterizing_2016}, ecological interactions can also be indirect. Consider 3 one-directional interactions between three groups ($a$, $b$, $c$) such that growth in $a$ predicts decreased growth in $b$ ($\phi_{a,b} < 0$), growth in $b$ predicts decreased growth in $c$ ($\phi_{b,c} < 0$), but $a$ and $c$ do not directly interact ($\phi_{a,c} \approx 0$).
+
+This does not necessarily mean that groups $a$ and $c$ are independent. Rather, an exogenous increase in $a$ predicts a decrease in $b$ and thereby an eventual increase in $c$.  Such indirect relationships are analyzed by using impulse response functions (IRFs) to interpret a VAR model \citep{box-steffensmeier_time_2014}.  In large VAR models containing many groups, the great number of parameters can mean that few specific elements of $\mathbf{\Phi}$ will be statistically significant, even as many weak direct relationships can combine into statistically significant IRFs \citep{canova_var_2007}. 
+
+\subsubsection{Average ecological interaction\nopunct} \label{sec:mes.avg.mut}  $\overline{m}$ measures the extent to which an overall ecological community is mutualistic or competitive by taking the mean point estimate of the off-diagonal coefficients of $\mathbf{\Phi}$:
+
+\begin{equation}\label{eq:average.interaction}
+\overline{m} = \frac{1}{\left|M\right| - 1} \sum_{i\in M} \sum_{j\in M;j\ne i} \phi_{i,j}
+\end{equation}
+
+\noindent If $\overline{m} > 0$ then mutualistic interactions within the ecological community are stronger than competitive ones, and if $\overline{m} < 0$ then competitive interactions are stronger than mutualistic ones.
+
+\subsubsection{Ecological interaction strength\nopunct} \label{sec:mes.abs.int} $\kappa$ quantifies the overall strength of ecological interactions in an ecological community as the mean absolute value of the point estimates of the off-diagonal coefficients of $\mathbf{\Phi}$:
+
+\begin{equation}\label{eq:average.absolute.interaction}
+\kappa = \frac{1}{\left|M\right| - 1} \sum_{i\in M} \sum_{j\in M;j\ne i} \left| \phi_{i,j} \right|
+\end{equation}
+
+\noindent where $\left| \phi_{i,j} \right|$ is the absolute value of the coefficient $\phi_{i,j}$.
+
+Ecological communities of subreddits with overlapping users vary in both the overall strength of ecological interactions and in the overall degree of mutualism and competition between member groups.  If an ecological community's average ecological interaction is positive, we say the ecological community is mutualistic.  If it is negative, we say the ecological community is competitive. The average ecological interaction can be close to 0 in two ways. First, the ecological interaction strength can simply be low.  Alternatively, the ecological community can have a mixture of competitive and mutualistic interactions that cancel one another out when averaged.  % Such an ecological community can have high ecological interaction strength. 
+
+\subsubsection{Impulse response functions\nopunct}\label{sec:mes.irf} (IRFs) of our VAR(1) models correspond to our visualizations of example competition-mutualism networks in §\ref{sec:case.studies}. An IRF predicts how much each group's size would change in response to a sudden increase in the size of each other group \citep{verhoef_community_2010}:
+
+\begin{equation}
+    \mathbf{\Theta_t} = \mathbf{\Theta_{t-1}}\mathbf{\Phi}, \quad t = 1,2,\ldots \label{eq:irf} 
+\end{equation}
+
+\noindent where $\mathbf{\Theta_t}$ is the impulse response function at time $t$.   $\mathbf{\Theta_0}$ is an $\left|M\right|$-by-$\left|M\right|$ identity matrix so our impulses represent a log-unit increase of 1 to each group. $\mathbf{\Theta_t}$ is a matrix with elements $\theta^t_{i,j}$ corresponding to the response of group $j$ to the impulse of group $i$.  We draw an edge $i \rightarrow j$ in the competition-mutualism network if the 95\% CI of $\theta^t_{i,j}$ does not include zero at any time $0 < t \leq 10$.  If $\theta^t_{i,j} > 0$, the edge indicates mutualism and if $\theta^t_{i,j} < 0$ the edge indicates competition.\footnote{In higher-order VAR($p$) models that use $p>1$ past observations as predictors, $\theta^t_{i,j}$ can be less than 0 for some $t_a$ and greater than 0 for some $t_b$. However, this is not possible in the VAR(1) models we use.}  We compute the IRFs with bootstrapped confidence intervals (CI) based on 1,000 samples using the \texttt{vars} \texttt{R} package.
+
+
+% The community matrix $\Phi$ is interpretable as a network of commensal relationships \citep{ives_estimating_2003}. While the coefficients of $\mathbf{\Phi}$ correspond to direct associations between group sizes \cite{novak_characterizing_2016}, commensal relationships can also be indirect. Consider relationships between three groups (A, B, C) such that A partially competes with B and B partially competes with C but A and C have no direct relationship. A VAR(1) model inferring these relationships will have negative coefficients for $\phi_{AB}$ and $\phi_{BC}$  but $\phi_{AC}$ will be nearly zero. 
+
+% TODO plot the examples on figure 1. 
+
+%The central prediction of density dependence theory is that there will be a curviliear, inverse-U-shaped ($\cap$-shaped) relationship between overlap density and growth.  
+
+\subsection{Study C: Predicting growth}
+
+\subsubsection{Average subreddit mutualism\nopunct}\label{sec:mes.sub.mut} $m_j$ is the independent variable for our test of H2 and measures the average influence of other subreddits in the ecological community on a given subreddit $j$, which we calculate by taking the mean of off-diagonal elements of row $j$ of the community matrix:
+
+\begin{equation}\label{eq:average.subreddit.mutualism}
+m_j = \frac{1}{\left|M\right|-1}\sum_{i\in M;i\ne j} \phi_{i,j}
+\end{equation}
+
+\noindent where $M$ is the set of subreddits in the ecological community and $\left|M\right|$ is the number of subreddits in $M$. We use the mean instead of the sum because different ecological communities have different numbers of subreddits.
+
+\subsubsection{Regression models for H2} We test H2 by using likelihood ratio tests to compare Model 1 % (above in \ref{sec:reg.H1}) 
+and Model 2 % in Equation \ref{eq:M2} 
+which adds \emph{average subreddit mutualism} ($m_i$) as a predictor. We also fit Model 3 % in Equation \ref{eq:M3} 
+which we compare to Model 2 to test if overlap density explains variation that average subreddit mutualism does not.
+
+\begin{align}
+\mathrm{Model~2} & & Y_i &= B_0 + B_1 d_{i} + B_2 d^2_{i} + B_3 m_i \label{eq:M2} \\
+\mathrm{Model~3} & & Y_i &= B_0 + B_3 m_i \label{eq:M3} 
+\end{align}
+\noindent where $Y_i$ is the growth of subreddit $i$, $d_i$ is its overlap density, $m_i$ is its average subreddit mutualism, and $B_0$, $B_1$, $B_2$, and $B_3$ are regression coefficients. 
+
+\subsubsection{Forecasting growth using ecological interactions}
+\label{sec:mes.forecasting}
+To test H3, we evaluate whether modeling ecological interactions improves time series forecasting of future participation in online groups by comparing the model in Equation \ref{eq:var1} to a baseline model with  off-diagonal elements of $\mathbf{\Phi}$ fixed to 0. This baseline model is equivalent to our VAR model, but excludes ecological interactions.
+
+We use two forecasting metrics with differing assumptions: root-mean-square-error (RMSE) and the continuous ranked probability score (CRPS).  RMSE is commonly used, non-parametric, and intuitive, but does not take differing scales of the predicted variable or forecast uncertainty into account.  Thus, in our setting it may place excessive weight on the forecasts of larger subreddits where errors may have greater magnitude simply because the absolute magnitude of the variance is greater.  By rewarding forecasts where the true value has high probability under the predictive distribution, the CRPS accounts for variance in the data and rewards forecasts for both accuracy and precision and is thus a ``proper scoring rule'' for evaluating probabilistic forecasts \citep{gneiting_strictly_2007}. Our CRPS calculations assume that the predictive forecast distribution for each community is normal with standard deviations given by the 68.2\% forecast confidence interval. We calculate CRPS using the \texttt{scoringRules} \texttt{R} package \citep{jordan_evaluating_2019}.
+
+\section{Results}
+\label{sec:results}
+
+% The organization of our results follows that of our methods.  We begin with Study A % (§\ref{sec:res:studyA}) 
+% in which we find, as predicted by H1, that the relationship between overlap density and growth is $\cap$-shaped relationship. Then, in Study B,% (§\ref{sec:res.characterizing})
+% we explore a typology of ecological communities along two dimensions: (1) the degree to which a community is mutualistic or competitive, and (2) the overall strength of ecological interactions between the communities member groups. In the N.clusters ecological communities analyzed in our VAR(1) analysis, we find that mutualistic relationships are much more common than competitive ones. Our case studies % (§\ref{sec:case.studies}) 
+% illustrate the typology using 4 example ecological communities.  Finally, in Study C, we do not find support for H2 %in §\ref{sec:res.likelihood.ratio.test} 
+% as adding average subreddit mutualism to the density dependence model does not improve growth prediction. But we do find, in support of H3, that ecological interactions improve forecasting performance in our time series models. 
+
+
+
+\begin{figure*}
+  \centering
+
+\includegraphics[width=\linewidth]{figures/knitr-fig_densityxgrowth-1} 
+
+\caption{Relationship between density and growth.  A 2D histogram of subreddits with overlap density (log-transformed) on the X-axis and the change in the logarithm of the number of distinct commenting users on the Y-axis.  The black line shows the marginal effect of overlap density on growth as predicted by Model 2. The gray region shows the 95\% confidence interval of the marginal effect. \label{fig:density}}
+\end{figure*}
+
+% In §\ref{sec:ecology_background} we presented H1 before RQ1 but we report results for H1 in the same section as H2 since they refer to the same regression model. 
+
+%We first present high-level findings that demonstrate advantages of our community ecology approach upon the overlap density approach. We find that accounting for commensal relationships in time-series models increases forecasting accuracy; that including subreddit average commensalism explains additional variation in subreddit over overlap density; and we compare the conclusions drawn density dependence analysis based on the correlation of overlap density and growth can lead  about the ecological environment than our analysis modeling commensal relationships between groups. Finally, we examine the distribution of \emph{average commensalism} and \emph{average absolute commensalism} to illuminate a typology of ecological communities which we illustrate through
+
+\subsection{Study A: Density Dependence Theory}
+\label{sec:res:studyA}
+
+%As discussed in §\ref{sec:ecology_background}, population ecology approaches in social computing propose that the relationship between overlap-density and growth/survival outcomes reflect an environment that may be competitive, mutualistic, or a mixture of both \citep{wang_impact_2012,zhu_impact_2014}. 
+We test the classical prediction of density dependence theory as formulated in H1 using Model 1 % (Equation \ref{eq:M1} in §\ref{methods:density}) 
+which has first- and second-order terms for the effect of overlap density on growth.  As described in §\ref{sec:ecology_background}, H1 hypothesizes that overlap density will have a curvilinear $\cap$-shaped (inverse-U-shaped) relationship with growth indicated by a positive first-order regression coefficient and a negative second-order coefficient.  
+
+\begin{table}
+  \centering
+
+% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
+% Date and time: Thu, Jul 29, 2021 - 05:22:21 PM
+\begin{tabular}{@{\extracolsep{5pt}}lccc} 
+\\[-1.8ex]\hline 
+\hline \\[-1.8ex] 
+ & Model 1 & Model 2 & Model 3 \\ 
+ Overlap density & 1.50$^{*}$ (0.26) & 1.50$^{*}$ (0.26) &  \\ 
+  Overlap density$^2$ & $-$2.08$^{*}$ (0.41) & $-$2.09$^{*}$ (0.41) &  \\ 
+  Average subreddit mutualism &  & 0.12 (0.26) & 0.11 (0.26) \\ 
+  Constant & $-$0.23$^{*}$ (0.03) & $-$0.23$^{*}$ (0.04) & $-$0.04$^{*}$ (0.01) \\ 
+ \hline \\[-1.8ex] 
+Log Likelihood & $-$4,970 & $-$4,970 & $-$4,986 \\ 
+Observations & 4,090 & 4,090 & 4,090 \\ 
+\hline 
+\hline \\[-1.8ex] 
+\textit{Note:}  & \multicolumn{3}{r}{$^*$p$<0.01$} \\ 
+\end{tabular} 
+
+\caption{Loglinear regression predicting subreddit growth as a function of overlap density. The model supports the prediction of density dependence theory of a $\cap$-shaped relationship between overlap density and growth. \label{tab:density}}
+\end{table}
+
+
+As predicted, we observe a $\cap$-shaped relationship between overlap density and growth.  Figure \ref{fig:density} plots the marginal effects of overlap density on growth for the median subreddit laid over the data on which the model is fit. Table \ref{tab:density} shows regression coefficients for Models 1--3. For about half of subreddits, increasing overlap density is associated with higher growth rates.  The point where increasing density ceases to predict increasing growth and begins to predict decreasing growth is at the 49\textsuperscript{th} percentile. 
+Prototypical subreddits at this overlap density grew slightly (95\% CI:[0.001,0.06]).  Yet subreddits at the lower and upper extremes of overlap density slightly declined on average. Typical groups at the 20\textsuperscript{th} percentile of overlap density decline by 1.1 members (95\% CI:[$-$1.15,$-$1.1]) and typical groups at the 80\textsuperscript{th} percentile decline by 1.2 members (95\% CI:[$-$1.28,$-$1.1]). 
+While we find support for the classical theoretical prediction of a curvilinear ($\cap$-shaped) relationship between overlap density and growth, this does not imply that relationships between highly overlapping communities are more competitive.  
+% Instead our results below % in §\ref{sec:res.characterizing} 
+% show that relationships in ecological communities of subreddits with high user overlaps are typically mutualistic. 
+
+
+\subsection{Study B: Introducing Community Ecology}
+\label{sec:res.characterizing}
+
+
+
+
+% describe the figure and the main takeaway
+% As described in §\ref{sec:characterizing.ecological.communities}, an ecological community can have positive or negative average ecological interaction §\ref{sec:mes.avg.mut} indicating if it is competitive or mutualistic and ecological interaction strength §\ref{sec:mes.abs.int}  provides a way to distinguish ecological communities with a mixture of competitive and mutualistic interactions from those where ecological interactions are weak. 
+
+Figure \ref{fig:commense.x.abs.commense} visualizes the distribution of average ecological interaction and ecological interaction strength over the 641 ecological communities we identify.  
+We observe ecological communities characterized by strong forms of both mutualism and competition, others having mixtures of the two, and some with few significant ecological interactions.  Mutualism is more common than competition, with the mean community having an average ecological interaction of 0.03 ($t=14.5$, $p<0.001$). We find that 524 clusters (81.7\%) are mutualistic. Not only are most ecological communities mutualistic, but more mutualistic ecological communities have greater ecological interaction strength (Spearman's $\rho=0.58$, $p<0.001$).
+% Note that due to our clustering procedure, our analysis examines ecological interactions among subreddits with relatively high degrees of user overlap.
+Therefore, our community ecology analysis suggests that among groups with similar users, mutualistic ecological interactions are more common than competitive ones.
+
+\begin{figure}
+
+\includegraphics[width=\linewidth]{figures/knitr-plot_commense_x_abs_commense-1} 
+
+\caption{Two-dimensional histogram showing ecological communities on Reddit in our typology.  The X-axis shows the overall degree of mutualism or competition in clusters of subreddits with high user overlap based on the average ecological interaction.  The Y-axis shows the ecological interaction strength representing the overall magnitude of competition or mutualism.}
+\label{fig:commense.x.abs.commense}
+\end{figure}
+
+
+
+\subsubsection{Example ecological communities}
+\label{sec:case.studies}
+
+We present four case studies to illustrate our typology of ecological communities of online groups. Figure \ref{fig:commense.x.abs.commense} shows that we find clusters of subreddits characterized by mutualism, competition, a mixture of mutualism and competition, and few ecological relationships at all. We select one case from each of these four types using our measures of average ecological interaction (§\ref{sec:mes.avg.mut}) and ecological interaction strength (§\ref{sec:mes.abs.int}). To allow for more interesting network structures, we draw our cases from the 367 large clusters having at least five subreddits. 
+
+\input{resources/network-figures.tex}
+
+Figure \ref{fig:networks} presents visualizations of competition-mutualism networks representing statistically significant impulse response functions as described in §\ref{sec:mes.irf}. During our analysis, we also examined the terms of the vector autoregression parameter $\mathbf{\Phi}$, the impulse response functions, and model fits and forecasts, all of which are available in our online supplement.  We also visited each subreddit in the clusters and read their sidebars and top posts to support our brief qualitative descriptions.
+
+\subsubsection{Mutualism among mental health subreddits}
+
+% TODO, cite somebody on mental health.
+To find a case characterized by mutualism, we selected the top 37 large clusters with the greatest average ecological interaction. From these, we arbitrarily chose one interesting ecological community, the \textit{mental health} cluster, which includes 11 subreddits for supporting people in struggles with mental health, addiction, and surviving abuse.  
+Constitutive subreddits include those focused on specific mental health diagnoses like \texttt{r\Slash bpd} (borderline personality disorder) and \texttt{r\Slash cptsd} (complex post-traumatic stress disorder) while others like \texttt{r\Slash survivorsofabuse} and \texttt{r\Slash adultsurvivors}
+are support groups. 
+
+The interactions among these subreddits are dense and primarily mutualistic as shown in Figure \ref{fig:mut.network}. There are a handful of competitive interactions like the reciprocal competition detected between \texttt{r\Slash codedependence} and \texttt{r\Slash bpd}. We also observe some interactions that are mutualistic in one direction and competitive in the other. For example, growth in \texttt{r\Slash addiction} predicts increasing growth in \texttt{r\Slash cptsd} even as that growth in \texttt{r\Slash cptsd} predicts decreasing growth in \texttt{r\Slash addiction}. This suggests a pattern in which \texttt{r\Slash cptsd} siphons members from \texttt{r\Slash addiction}. That said, the density of mutualistic interactions shown in Figure \ref{fig:mut.network} suggests that different subreddits have complementary roles in this ecological community as people turn to different types of groups for help with interrelated problems.  While attempting to explain why different online groups form mutualistic or competitive interactions is left to future research, the example of mental health subreddits shows how groups with related topics and overlapping participants can have mutualistic interactions where growth in one predicts growth in many of the rest.
+
+\subsubsection{Competition among real estate and finance subreddits}
+
+
+To find competitive clusters, we selected from the 36 large clusters with the lowest average ecological interaction an ecological community that we label \textit{finance}. Among the 6 subreddits in this cluster, \texttt{r\Slash realestateinvesting}, \texttt{r\Slash realestate} and \texttt{r\Slash commercialrealestate} all deal in different aspects of the real estate industry, while \texttt{r\Slash financialindependence} and \texttt{r\Slash fatfire} (the acronym ``fire'' means ``financial independence/retire early'') are focused on building wealth and becoming financially independent and \texttt{r\Slash financialplanning} is a general purpose subreddit for financial advice.
+
+In contrast to the mental health ecological community, the finance cluster has mostly competitive ties as visualized in Figure \ref{fig:comp.network}. The fact that even this cluster, among the most competitive in our data, contains a number of mutualistic ties reflects just how prevalent mutualism is among subreddits with high degrees of user overlap. That said, we detect three reciprocal competitive interactions among the three subreddits that focus on real estate. The edges from \texttt{r\Slash fatfire} to \texttt{r\Slash commercialrealestate} and \texttt{r\Slash financialindependence} are competitive as well.   
+Interestingly, all interactions between the general finance subreddits (\texttt{r\Slash financialplanning} and \texttt{r\Slash financialindependence}) and \texttt{r\Slash realestate} are mutualistic.
+%Interestingly, are mutualistic.
+
+\subsubsection{Mixed interactions among timepiece subreddits}
+
+Next, we turn to an example of an ecological community with low average ecological interaction but high  ecological interaction strength.  
+We first select the 36 %(10\%) 
+large clusters with the average ecological interaction  closest to 0. To find an ecological community with a mixture of mutualism and competition, we select from the 15 clusters with the greatest ecological interaction strength from within this group and chose the \textit{timepiece} cluster containing 7 subreddits about watches. 
+
+As shown in Figure \ref{fig:mixed.network}, the ecological community of timepiece subreddits is dense with ecological interactions (although not as dense as the mental health subreddits). We observe both reciprocated mutualistic interactions, like that between \texttt{r\Slash rolex} and \texttt{r\Slash gshock}, and competitive interactions like that between \texttt{r\Slash gshock} and \texttt{r\Slash seiko}.  We also observe numerous unreciprocated competitive and mutualistic relationships like the mutualism between \texttt{r\Slash watchexchange} and \texttt{r\Slash watchcirclejerk}\footnote{The suffix is widely understood on Reddit to signify a jokey, meme, or satirical subreddit.}
+and the competition between \texttt{r\Slash japanesewatches} and \texttt{r\Slash seiko}.
+Though the average ecological interaction among these subreddits is near 0, our analysis reveals a complex ecological community with a mixture of competition and mutualism.   
+ 
+\subsubsection{Sparse interactions among Call of Duty subreddits}
+
+To find a case where ecological interactions are weak, we return to the group of the 36 %(10\%) 
+large clusters with the average ecological interaction closest to 0 but select from the 15 clusters within this group with the lowest ecological interaction strength. From these, we chose the \textit{Call of Duty} cluster containing five groups about the popular military first-person shooter series of video games.
+
+% % more quotations
+The Call of Duty ecological community is sparse, having only two significant ecological interactions among its 5 member groups. This ecological community includes subreddits about different editions of the series such as \texttt{r\Slash blackops3}, \texttt{r\Slash infinitewarfare} and \texttt{r\Slash wwii} as well as one about a popular spin-off zombie game \texttt{r\Slash codzombies} and the more general \texttt{r\Slash callofduty} subreddit. We find that growth in \texttt{r\Slash blackops3} or \texttt{r\Slash codzombies} predicts growth in \texttt{r\Slash infinitewarfare}, and we detect no other ecological interactions. 
+
+The timepiece and Call of Duty ecological communities illustrate how subreddits with overlapping users can have relatively strong or weak forms of ecological interdependence.  Although both clusters are characterized by high degrees of user overlap and low average ecological interaction, the timepiece cluster has a dense competition-mutualism network while the Call of Duty network is sparse.
+    
+\subsection{Study C: Predicting Growth}
+\label{sec:res.studyC}
+
+We now compare the environmental approach of population ecology with the relational approach of community ecology.
+In Study B, we presented examples of diverse ecological communities among subreddits with overlapping members.  However, the presence of this diversity does not mean that ecological interactions are related to the growth of online groups, the key outcome of previous ecological studies.  We therefore hypothesized that ecological interactions will improve the predictive performance of a density dependence model in H2.
+
+\subsubsection{Ecological interactions do not improve growth prediction}
+\label{sec:res.likelihood.ratio.test}
+
+To test H2, we compare Model 1, our density dependence model having first- and second-order terms for overlap density, with Model 2, which also includes average subreddit mutualism (§\ref{sec:mes.sub.mut}) as a predictor.  We also examine Model 3, in which the only predictor is average subreddit mutualism. Table \ref{tab:density} shows regression coefficients for our models. 
+
+We do not observe a statistically significant association between average subreddit mutualism and growth ($B_3=0.12, SE=0.26$).  
+% We observe that average subreddit mutualism is positively associated with growth , which makes sense as subreddits with greater average subreddit mutualism benefit more from mutualism or are hurt less from competition.
+Moreover, a likelihood ratio test comparing Model 1 and Model 2 does not support H2 as Model 2 does not predict subreddit growth better than Model 1 ($\chi^2 = 0.23$, $p>0.05$). 
+% Therefore, average subreddit mutualism does not help predict growth compared to the density dependence model alone. 
+Comparing Model 2 to Model 3 shows that overlap density explains variation that average subreddit mutualism does not ($\chi^2 = 33$, $p<0.001$). 
+%This suggests that the density of a subreddit's niche helps explain subreddit growth in important ways not captured by ecological interactions.  
+Overlap density helps explain a group's future growth, but the overall degree of mutualism or competition a group faces in its ecological community does not. 
+% In §\ref{sec:discussion}, we discuss how overlap density may only capture the hospitality of a group's environment and may be independent of mutualism and competition within its ecological community.
+
+\subsubsection{Forecasting accuracy}
+\label{sec:res.forecasting}
+
+The likelihood ratio tests in §\ref{sec:res.likelihood.ratio.test} are limited because improvements in predictive performance (or lack thereof) may be due to unobserved factors predictive of growth that are correlated with average subreddit mutualism. We hypothesized in H3 that the intergroup dependencies in our VAR models can better forecast the size of subreddits compared to baseline time series models that do not account for ecological interactions.  As described in §\ref{sec:mes.forecasting}, we test H3 by comparing two forecasting metrics: the root-mean-square-error (RMSE) and the continuous ranked probability score (CRPS).
+
+VAR models including ecological interactions have forecasting performance superior to the baseline model in terms of both RMSE and CRPS. We evaluate the 24-week forecast performance for all  subreddits which were assigned to clusters. The RMSE under the baseline model (0.84) is greater than the RMSE of the VAR models (0.75) and the CRPS of the baseline model (72,853) is also  greater than the CRPS of the VAR models (72,669).  This reflects a substantive improvement in forecast accuracy robust to the choice of the forecasting metric.  
+
+Our baseline model contains a constant term and a trend term for each group and therefore accounts for all time-invariant within-group variation.  Because overlap density is a subreddit-level variable that does not vary over time,
+we know that the improvement in forecasting performance comes from modeling ecological interactions in ways not captured by overlap density.
+
+\section{Threats to Validity}
+\label{sec:limitations}
+Our work is subject to several important threats to validity that we cannot fully address. First, we study ecological communities on only one platform hosting online groups and our results may not generalize to other platforms or time periods.
+Additionally, while our community ecology approach assumes that ecological interactions drive dynamics in the size of groups over time and cause groups to grow or decline, drawing causal inference using our method would depend on several untestable assumptions. For example, our ability to infer causal relationships might be limited if groups we do not consider---including groups on other platforms---play a role in an ecological community. Regression estimates in Models 1-3 may be confounded by omitted variables and cannot support causal interpretation. 
+Therefore, we refrain from claiming that the relationships we infer are causal.
+
+The method we propose for identifying ecological interactions between online groups has limitations common to all time series analysis of observational data. 
+Potential omitted variables might also include additional time lags of group size. Although we chose to use VAR(1) models with only 1 time lag, we hope future work can improve upon our approach and model more complex dynamics with additional lags.
+% Our results are offered as limited temporal associations consistent with inferred ecological interactions.
+Like most other time series analysis, vector autoregression assumes that the error terms are stationary. This is difficult to evaluate empirically and may not be realistic \citep{canova_var_2007}.  Future work might relax these assumptions using more complex models with time-varying parameters, state space models \citep{box-steffensmeier_time_2014}, nonlinear time series models \citep{cenci_regularized_2019, kantz_nonlinear_2003}, or stationarity-enforcing priors \citep{heaps_enforcing_2020}.  Such approaches may require additional contextual knowledge and be difficult to scale to an analysis of hundreds of different ecological communities, but may prove fruitful in future work focusing on ecological communities of interest. Such models may also be useful in future work investigating how ecological interactions change over time.
+
+Additional threats to validity stem from our use of algorithmic clustering to identify ecological communities.
+Organizational ecologists have rarely attempted to estimate the full community matrix for an entire population containing a large number of groups because of data and statistical limitations \citep[e.g.][]{ruef_emergence_2000, sorensen_recruitment-based_2004}. For instance, 100 million possible ecological interactions exist within a set of 10,000 communities.  Attempting to infer them all raises considerable computational and statistical challenges.
+% This makes it necessary to narrow the scope to the ecological communities of interest in ways appropriate to the research question.
+We chose to use a clustering analysis to explore the typical ecological communities on a platform.
+
+% Yet, a 
+
+While we choose clusters based on high degrees of user overlap and validate our clustering in terms of the silhouette coefficient and purity criteria, we might have obtained different results if we had clustered in a different way. Additionally, our efforts to obtain clusters with a high silhouette coefficient lead us to remove a large number of subreddits from our analysis. Thus, our results are not representative of Reddit overall, but only of those subreddits that were included in our analysis.  Furthermore, clustering algorithms like the one we use may not have unique solutions and different initial conditions and hyperparameters might lead to different results. While these allow us to scale up our analysis, future work should use principled definitions of an ecological community based on qualitative contextual knowledge in focused studies of particular ecological communities.
+% future investigations should also consider qualitative approaches to constructing ecological communities.
+% Finally, our three cases studies are limited in that they can offer only a proof-of-concept analysis and an enticing hint at more comprehensive future analyses with more rigorously defined populations of online groups.
+% Although we found varying results in the three ecological communities we selected, these case studies can provide little explanation for when one should expect to find different forms of commensalism in online groups. Our hope is that these initial results can point in new directions for research. 
+% % We looked at three different sets of related online groups and found three qualitatively different ecological communities.  
+% As is true in all case study research, there is little reason to expect findings from any one of our case studies to generalize to any specific other set of contexts.
+
+\section{Discussion}
+\label{sec:discussion}
+
+To introduce community ecology and compare it to population ecology, we presented three studies. In Study A, we found support for H1 showing---as predicted by density dependence theory---that overlap density has an $\cap$-shaped association with subreddit growth.
+Subreddits with moderate overlap density in our data declined less than subreddits with either very low or very high overlap density.
+According to population ecology theory, this suggests that  high-density environments are competitive and less conducive to growth than medium-density environments.
+
+%prevalence of mutualism among highly overlapping subreddits contrast with our results for
+
+Surprisingly, this contrasts with our results in Study B, where we studied the diversity of ecological communities using vector autoregression models of group size over time to infer networks of ecological interactions.
+%surveyed clusters of highly overlapping groups on Reddit to.
+We find ecological communities that are mutualistic or competitive, that mix the two, or that have few significant ecological interactions at all. Overall, however, ecological communities of subreddits are typically mutualistic and mutualistic interactions are stronger on average than competitive ones. Although we find evidence of density dependence, density-dependent competition does not necessarily reflect typical relationships in ecological communities of highly overlapping subreddits.
+
+%As discussed more below, our results are due to the fact that support for H1 does not necessarily mean that most relationships between subreddits with the greatest degrees of user overlap are competitive.
+
+Our results in Study C show that the size of the other members of an ecological community improves time series forecasts of participation in online groups. However, average subreddit mutualism did not help predict growth. 
+This suggests that population ecology and community ecology offer complementary environmental and relational perspectives.   
+Population ecology's focus on environmental factors such as niche and overlap density is useful for predicting growth, but does not provide a way to study networks of mutualism and competition.
+Community ecology unpacks density and provides insights about the specific relationships between groups.  While modeling these interactions helps forecast participation levels in groups, the existence of these interactions may be independent of future growth. For example, if mutualistic relationships are common in declining ecological communities, that would explain our result for H2.
+
+%  these interactions helps time series forecasting, but whether the interactions 
+
+% While we advance community ecology as an alternative framework to population ecology, our results show that population ecology and community ecology are complementary perspectives. 
+% We tested H2 to find out whether including subreddit average mutualism improves the ability of a density dependence model to predict the size of a subreddit n.test weeks in the future and found that it did not. Therefore,
+
+% Yet in support of H3, including ecological interactions in the vector autoregression (VAR) models substantially improves their forecasting performance. 
+ 
+
+% Our  findings in Study A and Study B may appear contradictory, their coincidence in our data points to ways in which population ecology and community ecology conceive of different kinds of ecological dynamics. 
+
+The complementary nature of the two ecologies is seen in the coincidence of our findings in Study A and Study B.   
+Indeed, these results can help explain  the puzzling set of empirical results about the relationship between overlap density and outcomes like growth, decline and survival  \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}.
+Studies of density dependence theory in social computing measure the density of an online group's niche in terms of its overlap in participants or topics. 
+%Resource overlaps seem to reflect competitive forces in some circumstances but mutualistic ones in others. 
+Our analysis clearly shows that resource overlaps between two groups might have little to do with whether they are mutualists or competitors. Instead, overlaps may simply reflect the hospitality of the environment to groups with overlapping topics or user bases.  
+As a result, the differing environmental conditions of Wikis and Usenet groups might explain why user overlap was associated with the survival of wikis \citep{zhu_impact_2014} but with the decline of Usenet groups \citep{wang_impact_2012}. Wikia was a young and growing platform during \citepos{zhu_impact_2014} data collection period when the growth of groups may have been limited by knowledge of how to build a wiki, and this knowledge was provided by overlapping experienced users. 
+Usenet was in decline during \citepos{wang_impact_2012} study period and this may have produced competitive environmental conditions as users became more scarce. 
+%Users of groups with high overlap density may have greater commitment to the platform than to any particular group and competition over such users may become fierce when a platform goes into decline. 
+
+% as users with comm
+
+% because 
+
+% and \citeauthor{tan_all_2015} \cite{tan_all_2015} observe that accounts posting in fewer different groups are more likely to leave a platform.  
+% As \citeauthor{kraut_building_2012} \cite{kraut_building_2012} argue, commitment to subgroups can enhance commitment to a broader group.  This suggests that On the other hand, members of a group with high overlap density may have little commitment to it in particular.  
+
+% This suggests that commitment to a 
+
+% We suggest that when commitment to the platform declines this may amplify competition as 
+% may present environmental conditions for strong competition over those members  
+% This suggests that 
+% Such groups may face greater challenges in sustaining participation when the platform goes into decline. 
+
+The widespread mutualism found in Study B resonates with long-held understandings of ecological interactions in evolutionary theory \citep{kropotkin_mutual_2012}.  Competition is unlikely to persist because it decreases survival. Because mutualism increases survival, it will be favored by natural selection \citep{armstrong_competitive_1980, axelrod_evolution_1981}. Similarly, competition can be avoided if groups adopt specialized roles in their ecological community, a dynamic known as resource partitioning in organizational ecology \citep{carroll_concentration_1985,menge_competition_1972,schoener_resource_1974}. Resource partitioning theory suggests that the competition among real estate subreddits observed in Figure \ref{fig:comp.network} may be due to a lack of specialization.  If specialization does not emerge over time, such groups of competing subreddits may have decreased survival. By contrast, mental health support groups like those observed in Figure \ref{fig:comp.network} appear to have distinctive purposes or roles. Future work to test such mechanisms in ecological communities of online groups may reveal ways that online groups complement or cooperate with each other.
+
+
+%Our results demonstrate population ecology's approach to competition and mutualism in a test of density dependence theory and provide an evaluation of community ecology's ability to predict subreddit growth.
+
+
+%Future work should directly test this hypothesis about the relationships between platform-based and subgroup-based commitment.
+
+% In general, competition over overlapping resources will have no effect on group growth if something besides the overlapping resource limits growth \cite{verhoef_community_2010}. For example, two wikis might share a large number of contributors (have high user overlap), but their growth might be limited by a lack of core contributors who perform important administrative tasks like policy making and software administration \cite{zhu_impact_2014}.   Community ecology relaxes the assumption that competition and mutualism are caused by user overlap density and instead seeks to infer them from data.
+% To illustrate our approach, we presented 4 example ecological communities found on Reddit §\ref{sec:case.studies}.
+Within large platforms for online groups, the great number of ecological communities that can be studied should make it possible for future work to apply methods from network science to construct and test generalizable theories about the roles of different types of resources, design features of platforms, and governance institutions in these ecological interactions. Future work should also incorporate community ecology analysis in case studies of important topics such as ecological communities engaged in peer production, political mobilization, misinformation, or mental health support. 
+
+Although we focused on online groups within a single platform, groups may use multiple platforms with distinctive affordances for different purposes \citep{fiesler_moving_2020, kiene_technological_2019}. Since the VAR method relies only on time series data to infer ecological interactions, it can be applied to study ecological communities spanning social media platforms. Community ecology can thus provide a bridge between quantitative studies of participation in online groups and theories of interconnected information ecologies \citep{nardi_information_1999}. While we focus on relationships between groups sharing a platform, one can apply our concepts and methods to understand how interdependent systems of technologies and users give rise to higher levels of social organization on social media platforms \citep{astley_two_1985, aldrich_organizations_2006}. 
+
+\subsection{Implications for Design}
+    
+% While Resnick et al.~\citep{resnick_starting_2012} 
+In the final chapter of their book on \textit{Building Successful Online Communities}, \citet{kraut_building_2012} advise managers of online groups to select an effective niche and beware of competition. However, these recommendations are based on little direct evidence from studies of online groups and offer almost no concrete steps that a designer or group should take based on either piece of advice. Although further research into ecological interactions is needed before design principles can be derived, we provide a framework for online group managers to think about ecological constraints on group size. 
+While intuition suggests that online group managers might seek out mutualistic relationships and avoid competitive ones, it is often not obvious whether another group with overlapping users is a competitor or mutualist. 
+Our method provides a way for group managers to know. 
+
+Competitors have a negative impact on growth, but ecological theory suggests that specialization is an adaptive strategy in response to competition \citep{aldrich_organizations_2006, carroll_concentration_1985, kraut_building_2012, powell_network_2005}. 
+%For example, the growth of Wikipedia caused other online encyclopedia projects to shift their focus \cite{hill_almost_2013}. 
+Using our method, group managers might identify competitors limiting the growth of their groups. With the knowledge of this analysis in hand, they might be able to escape a competitive dynamic by specializing. 
+While competitive relationships are defined by how they decrease the size of groups, competition can also be important to the health of the broader ecological community. Exit to an alternative group can be an avenue for political change in response to grievances and poor governance \citep{hirschman_exit_1970, frey_emergence_2019}. The threat of competition with other groups may make expressions of voice more persuasive to moderators or platforms \citep{hirschman_exit_1970}. 
+
+Groups looking to increase activity should desire to seek out mutualistic relationships, and we believe that designers of online platforms can help them do so. Features such as meta-groups, group search, recommendation engines, and practices like linking related groups may lower barriers between groups and support mutualism. However, it is not obvious to what extent particular features will support competition, mutualism, or both.  Using our method, managers and designers can test features intended to support mutualism.
+
+\section{Conclusion}
+
+% Rewrite conclusion
+While explanations for the rise or decline of online groups often look to internal mechanisms, understanding the role of interdependence between online groups is increasingly important.
+While prior research has investigated competition and mutualism among online groups with overlapping users and topics using the population ecology framework \citep{wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}, this approach does not provide a way to infer competitive or mutualistic interactions among related groups.
+We introduce the community ecology framework as a complementary perspective to population ecology. 
+% The two ecologies both seek to explain why online groups grow or survive, but they focus on different levels of analysis \cite{astley_two_1985}.  
+By inferring competition-mutualism networks directly from time-series data, our community ecology approach helps resolve the empirical tensions raised by prior ecological work in social computing and reveal that most interactions within clusters of subreddits with highly overlapping users are mutualistic. Our methods provide a foundation for future work investigating related online groups.  
+% \printbibliography[title={References},heading=secbib]
+
--- a/dissertations/nathante_uw_2021/ch5_conclusion.tex
+++ b/dissertations/nathante_uw_2021/ch5_conclusion.tex
@ -0,0 +1,302 @@
+% \documentclass[12pt]{memoir}
+
+% \usepackage{cdsc-memoir}
+% % there are two chapter styles: cdsc-article and cdsc-memo
+% % memo assumes that you remove the "\\" and the email address from the
+% % \author field below as well as that you will comment out the
+% % \published tag
+% \chapterstyle{cdsc-article}
+
+% \usepackage[utf8]{inputenc}
+% \usepackage{wrapfig}
+% \usepackage[T1]{fontenc}
+% \usepackage{textcomp}
+% \usepackage[garamond]{mathdesign}
+
+% \usepackage[letterpaper,left=1in,right=1in,top=1in,bottom=1in]{geometry}
+
+% % packages i use in essentially every document
+% \usepackage{graphicx}
+% \usepackage{enumerate}
+
+% % packages i use in many documents but leave off by default
+% % \usepackage{amsmath, amsthm, amssymb}
+% % \usepackage{dcolumn}
+% % \usepackage{endfloat}
+
+% % import and customize urls
+% \usepackage[usenames,dvipsnames]{color}
+% \usepackage[breaklinks]{hyperref}
+
+% \hypersetup{colorlinks=true, linkcolor=Black, citecolor=Black, filecolor=Blue,
+%     urlcolor=Blue, unicode=true}
+
+% % list of footnote symbols for \thanks{}
+% \makeatletter
+% \renewcommand*{\@fnsymbol}[1]{\ensuremath{\ifcase#1\or *\or \dagger\or \ddagger\or
+%  \mathsection\or \mathparagraph\or \|\or **\or \dagger\dagger
+%   \or \ddagger\ddagger \else\@ctrerr\fi}}
+% \makeatother
+% \newcommand*\samethanks[1][\value{footnote}]{\footnotemark[#1]}
+
+% % add bibliographic stuff 
+% \usepackage[american]{babel}
+% \usepackage{csquotes}
+% \usepackage[natbib=true, style=apa, backend=biber]{biblatex}
+% \addbibresource{ch5_conclusion.bib}
+% \DeclareLanguageMapping{american}{american-apa}
+
+% \defbibheading{secbib}[\bibname]{%
+%   \section*{#1}%
+%   \markboth{#1}{#1}%
+%   \baselineskip 14.2pt%
+%   \prebibhook}
+
+% \def\citepos#1{\citeauthor{#1}'s (\citeyear{#1})}
+% \def\citespos#1{\citeauthor{#1}' (\citeyear{#1})}
+
+% % memoir function to take out of the space out of the whitespace lists
+% \firmlists
+
+% % LATEX NOTE: these lines will import vc stuff after running `make vc` which
+% % will add version control information to the bottom of each page. This can be
+% % useful for keeping track of which version of a document somebody has:
+% % \input{vc}
+% % \pagestyle{cdsc-page-git}
+
+% % LATEX NOTE: this alternative line will just input a timestamp at the
+% % build process, useful for Overleaf
+% % \pagestyle{cdsc-page-overleaf}
+
+% \begin{document}
+
+% \setlength{\parskip}{4.5pt}
+% % LATEX NOTE: Ideal linespacing is usually said to be between 120-140% the
+% % typeface size. So, for 12pt (default in this document, we're looking for
+% % somewhere between a 14.4-17.4pt \baselineskip.  Single; 1.5 lines; and Double
+% % in MSWord are equivalent to ~117%, 175%, and 233%.
+
+% \baselineskip 16pt
+
+% \title{Future Directions in the Ecology of Online Communities}
+% \author{Nathan TeBlunthuis\\
+%         \href{nathante@uw.edu}{nathante@uw.edu}}
+% \date{September 1, 2021}
+
+% \maketitle
+
+% %  LocalWords:  
+
+% % TO CONCLUSION:
+
+% \section{}
+
+Chapter 1 says that ``the project of this dissertation is to begin reconstructing organizational ecology in the relatively theory-poor but data-rich context of online communities.'' By focusing on understanding the relationships between related online communities in ecological terms of competition and mutualism and in the emic language of members of overlapping communities, the preceding work seeks to build an empirical foundation to build new ecological theory.  It has found qualitative and quantitative evidence that overlapping online communities often fill distinctive niches by providing complementary benefits to their users.  Competitive dynamics also occur, and can be strong, but do not last as long.  Although competition and mutualism play a role in their growth and survival, communities may not adapt to promote mutualism and avoid competition.  Rather it seems likely that the ``principle of competitive exclusion'' takes hold in some other way, perhaps through a selection process in which communities normally must provide complementary benefits to existing ones in order to take off.
+
+Of course, these claims are limited by the empirical tools that were used to support them.
+Inferences about competition and mutualism are based upon time series models with fundamentally untestable assumptions.
+By fitting a far greater number of models than I could carefully specify I have taken an unabashed ``big data'' approach. 
+To make confident claims about any particular competitive or mutualistic relationship between two subreddits I would have to conduct a relatively exacting model selection and comparison procedure based on additional contextual knowledge of the communities' histories.  
+The large scale of this analysis supports the general findings enumerated above assuming that any model misspecifications have not introduced errors in a systematic and misleading way.
+The fact that both the linear and nonlinear time series analyses and the active community members all seem to agree that mutualism is more common than competition provides some reassurance. It seems quite unlikely that all three will mislead in similar ways. 
+
+The reconstruction project is still beginning, but at this stage we can propose preliminary answers to some key theoretical questions: (1) How do people construct systems of overlapping online communities? (2) What types of ``resources'' are most important for mediating ecological interactions? (3) How do ecological interactions relate to broader dynamics such as the growth of a platform or the popularity of a broader topic? and (4) How do barriers between different platforms affect cross-platform ecological relationships?
+
+Question (1) is fundamental to an ecological explanation for the development of online communities. A preliminary answer is that \emph{people construct systems of overlapping online communities as new online communities find distinctive niches in the neighborhood of existing communities relatively early in their development}.  Chapter 4 finds evidence that systems of overlapping online communities are not constructed through an adaptation process and suggests a selection process as an alternative.  It should be noted that selection and adaptation are not mutually exclusive and the systems of overlapping communities may develop through a hybrid process. Chapter 3 suggests that a large majority of active online communities each have a distinctive ecological niche. It seems likely that successful online communities often quickly find a niche early in their development.   
+
+Prior research and Chapter 2 both see users and topics as related to rival or non-rival resources that make competition and mutualism between online communities possible. However,  Chapter 2 finds that user and topic overlap densities are very weakly correlated with online community growth suggesting that user and content overlaps are not very close analogs for the kinds of resource overlaps considered by organizational ecologists, such as the technological range of a firm's outputs \citep{dobrev_shifting_2003}. Based on findings from Chapter 3, a preliminary answer to question (2) is that \emph{online communities' ecological niches are a product of content categories, audiences, and social capital}. These dimensions of an online community's niche might be difficult to precisely measure, but they can be described in theory. 
+
+Content categories are socially constructed classes such as ``memes,'' ``Q\&A,'' ``news,'' ``commentary,'' ``art,'' ``documentation'' and ``discussion.''  Online communities often specialize in a subset of possible content categories.
+Specialization in a set of content categories might be achieved formally through rules or definitions of topical scope or informally, through the community's size, or the preferences and behaviors of its members. 
+The empirical work so far considers topics measured through semantic similarity or language models.
+Content categories are likely to be correlated with such measures, but the measures are unlikely to faithfully capture important aspects of content categories like differences in medium, genre and form. 
+
+The notion of social capital and audience as distinct aspects of a niche disentangles the concept of ``user.''  Social capital refers to the benefits that come from interpersonal interaction and sense-making with a homophilous or tight-knit community \citep{ackerman_sharing_2013}. Measures of group size or user overlap may be correlated with social capital, but they do little to distinguish a user who comments as a member of a crowd-like audience from a user who seeks social bonds and interactions with fellow members of their identity group or enthusiasts in their hobby. 
+
+Question (3) is an important part of an ecological explanation for the rise and decline of platforms in terms of the communities they host. A preliminary answer is that \emph{ecological interactions and the rise or decline of a topical area drive one another in a feedback process}. Chapter 2 suggests that growing platforms may be more likely to have mutualistic dynamics as they have an increasing number of potential niches for online communities of varying sizes and scopes. At the same time, mutualistic interactions among overlapping communities are likely to drive the rise of a platform as mutualists enrich niches in their neighborhoods. In a similar way, competition and the decline of a topical area might reinforce each other if out-migration of users interested in the topic induces competition over the remaining users and this accelerates the communities' declines prompting further out-migration.
+
+Question (4) considers the ecological consequences of how different social media platforms divide related online communities such as the Wikis and subreddits about the same topic. A preliminary answer is that \emph{barriers between platforms limit both mutualistic and competitive dynamics} because of how they limit the sharing of users or content across platforms. However, when non-rival resources such as information and community building know-how are transferable across platforms, communities on platforms designed to provide different types of benefits are likely to be mutualists.
+For example, subreddits and Wikis about similar topics are probably mutualists because wikis are designed primarily for developing and sharing encyclopedic information and subreddits often focus on socialization and discussion.
+
+Now I will sketch several possible directions for near-future work in this research program.  Some of these potential projects seek to develop more complete answers for the key theoretical questions and others will bridge ecological analysis to specific practical problems. 
+My hope is that empirical support and theoretical development will soon be sufficiently advanced to inform the design of present and future online community ecosystems and to understand the successes and limitations of peer production.     
+
+\section{Ecological Relationships Between Platforms}
+
+A significant limitation of my empirical studies has been that they focus only on interactions among communities within a single large platform.  However, online communities often overlap across platforms \citep{kiene_technological_2019} and cross-platform interactions are likely to be important \citep{vincent_examining_2018}.  For example, Reddit's growth enormously increased in 2010 when users of rival site Digg.com migrated \emph{en masse}, suggesting that during this period subreddits and Digg sections were in competition \citep{noauthor_digg_2021}.  In chapter 6 of \emph{Building Successful Online Communities}, \citet{resnick_starting_2012} recommend that new online communities ``carve out a useful and defendable (sic) niche in the ecology of competing communities.'' They base this recommendation upon virtually no evidence taken from studies of online communities or organizational ecology but rather by following intuitions drawn from economics and assuming that online communities may find themselves in ``winner-take-all'' situations.  Although they recommend specialization as a strategy for avoiding competition, they also suggest ``lock-in'' features like having different user interfaces and making it so identities cannot be shared between communities. 
+
+At issue is how \citet{resnick_starting_2012} attempt to simultaneously adopt the perspectives of two different types of actors whose interests are often unaligned.
+Commercial platforms need to generate private revenues and seem to better fit the classical models of organizational ecology that have niche overlaps as highly correlated with competition.
+A commercial platform may find mutualism between cross-platform communities a nuisance and may find the ``lock-in'' features unequivocally beneficial.
+However, building a successful online community is not the same as building a platform that hosts online communities.
+My ecological studies of relationships between communities suggest that mutualism is widespread among actually existing online communities within a platform.
+In my conversations with members of overlapping communities, I learned that they often benefit from overlapping communities on different platforms. 
+Therefore it seems likely that communities on commercial platforms that are both sufficiently ``open'' and sufficiently differentiated will also be mutualistic, even if the platforms compete with each other over revenues. If so, this points to the promise of designs that support resource sharing across such platforms.
+
+Knowledge about inter-platform ecological dynamics is only beginning to be created.
+\citet{nagaraj_how_2021} have found that open source knowledge projects like OpenStreetMap are hurt by competition with proprietary alternatives.
+Cross-platform studies of digital traces face difficulties because it is not generally possible to associate user accounts on different platforms.  
+However, the time-series models I have used only depend on finding related communities and therefore enable studying ecological interactions without tracking users across platforms.
+I am developing a new dataset of related subreddits, Fandom.com wikis, and Wikipedia articles to investigate ecological interactions between related communities on different platforms.
+
+\section{Selecting Niche Width}
+
+Choosing a scope is an important design decision for organizations and for online communities. As I found in Chapter 3, broad and narrow scopes are associated with trade-offs in the types of benefits that a community can provide. The choice of scope, or the choice of how a community will specialize, may also have implications for the community's short and long run survival.  According to theories of organizational ecology, the choice of scope may affect a community's competitive and mutualistic dynamics and its ability to weather changes in a turbulent environment.  
+
+Resource partitioning theory, discussed briefly in sections of Chapters 2, 3 and 4, provides a framework for understanding how specialization relates to competition.  It proposes that larger generalists can coexist with specialists because large generalists are not optimally efficient at all of their activities, leaving opportunities for specialists to out-compete them in narrow niches \citep{carroll_concentration_1985}.  Findings from Chapter 2 suggest that one prediction of resource partitioning theory seems to obtain in groups of overlapping online communities. This is that they often have a ``main'' community which is a large generalist and people participate in the specialist communities in order to obtain distinctive benefits not easily obtained in the main community \citep{baum_ecological_2006}.  
+
+A related theory fragment of organizational ecology, niche width theory \citep{dobrev_dynamics_2001, freeman_niche_1983}, proposes that specialists are less able to survive during periods of rapid change. Large generalists may have advantages in changing environments because their diversity of interests which spreads out risk, their experience transferring knowledge between different parts of their organization, and their slack resources can all help them absorb negative outcomes \citep{dobrev_shifting_2003}. 
+As discussed in Chapter 4, online communities may inhabit unstable environments where sudden events, ongoing trends, and abrupt policy changes can all affect participation \citep{ratkiewicz_characterizing_2010}.
+
+An example illustrates how environmental change can threaten the success of specialists. During the Trump administration, a number of anti-Trump subreddits were organized around specific controversies (e.g., \texttt{r\Slash the\_mueller, r\Slash marchagainsttrump, r\Slash keep\_track, r\Slash russialago}).
+\texttt{r\Slash the\_mueller} was a subreddit about the Special Counsel's investigation into Russian election interference. 
+% As shown in \ref{fig:the.meuller}, 
+The number of posts in these subreddits declined following the end of the investigation. However, \texttt{r\Slash the\_mueller} has survived by successfully adapting and now has several posts a day critical of Trump but not specifically about the Mueller investigation. Yet a similar subreddit, \texttt{r\Slash russialago}, has declined to a much lower activity level (a few posts a week) but remains focused on Russian interference. By comparison, the number of posts in the generalist (but still left-leaning) \texttt{r\Slash politics} has remained relatively stable.  
+ Niche width theory would predict that shifting to more general types of anti-Trump content may expose \texttt{r\Slash the\_mueller} to greater competition with other political subreddits. However, if it had not adapted it might have little reason to exist after the end of Mueller's investigation.
+
+Theories of online community specialization can be empirically tested with better quantification of the ways that overlapping communities are different from one another.  These include features of content like choice of medium (text, images, video, links), content sources (what websites are they linking to?), types of participants with varying roles and styles of participation, and structures like policies, size and moderation.  Niche width theory additionally requires measuring environmental changes that may threaten the survival of communities. Observable events corresponding to interesting environmental variation may include crisis events, elections and the release or cancellation of entertainment products. Comparing the growth, performance, and ecological dynamics of overlapping communities during times of high or low change can test these theories and point toward design principles for online community scoping that account for the trade-offs in different types of specialization.
+
+
+% Other studies in organizational ecology, and in biological ecology more generally, resource partitioning refers to how different groups specialize to minimize niche overlaps and avoid competition.   
+
+\section{Ecological Implications for Production and Performance}
+
+So far, the ecology of online communities has focused on understanding competition and mutualism among overlapping online communities.  An important limitation of this work has been to conceptualize competition and mutualism as dynamics related to the growth of online communities.  This follows biologists and organizational ecologists, but not all online communities have to grow in order to provide their intended benefits \citep{foote_starting_2017}.  An important step forward for this research program will be to relate interdependence between online communities to outcomes besides growth that may be more directly connected to the value of the public goods that communities produce.
+
+Quantifying the value of public information goods produced by online communities is a major methodological and theoretical challenge.  Much of the field of economics depends on the assumption that the utility of a good can be measured by its price.  Price is a valuable measure of value in economic theory because it is set by market mechanisms that align supply and demand.   Online communities are thought to be able to produce public goods because they can lower transaction costs \citep{benkler_coases_2002}. Negotiating a price in these settings is simply not worth it. A price will reintroduce transaction costs and undercut the pro-social motivations people have for contributing.  
+
+Of course, this does not mean the public goods online communities produce are worthless. Estimates of the cost of replacing Wikipedia by paying editors a market rate placed its value between 6 and 10 billion dollars in 2013 \citep{band_wikipedias_2013}.  However, without a price mechanism, supply and demand may become ``misaligned.'' The quality of Wikipedia articles is uneven and the most popular content is often not the highest quality \citep{warncke-wang_misalignment_2015, gorbatai_exploring_2011}.
+In classical economic theories, goods will be produced to meet the demand, but in peer production the size of an audience seems only weakly related to the level of production.
+Explaining when online communities will produce high quality public goods like Wikipedia articles \citep{arazy_evolutionary_2019,arazy_determinants_2010,asthana_few_2018} or open source software \citep{champion_underproduction_2021} is thus  important to understanding the successes and failures of peer production.
+
+% Critical mass theory can potentially explain how supply and demand can be linked in public goods production and can also be synthesized with ecology \citep{marwell_critical_1993}.  The central mechanism of the theory the notion of a ``production function,''  which maps a quantity of contributor input to a level of good produced. The theory proposes that the shape of the production function is determined by the collective action problem that a group must overcome to produce the good and determines the level of the good that will be produced by rational actors. Some prior research applies this theory to online communities, but does not operationalize its central propositions related to production functions \citep{solomon_critical_2014}. 
+
+Critical mass theory offers to explain the conditions for successful collective action in public goods production and can also be synthesized with ecology \citep{marwell_critical_1993}. Many CSCW systems appear to require a critical mass of users to start or sustain their usefulness \citep{ackerman_intellectual_2000}.  The most important device in the theory is the \emph{production function}, which maps an individual's contributions to the value they get from contributing. The theory proposes that the shape of the production function is determined by the collective action problem that a group faces in producing the good.  If a production function is \emph{accelerating} (\emph{decelerating}) then a contribution increases (decreases) the payoff of the next contribution.
+The rational actors in a group each have their own production function and together these determine the level of the good that they will produce. Some prior research applies this theory to Wikipedia, but does not attempt to measure the value of contributions or operationalize the theory's propositions about the relationship between production functions and collective action \citep{raban_empirical_2010, solomon_critical_2014}. Analyzing critical mass theory in the context of communal public goods production can also be an important theoretical contribution to communication theory \citep{fulk_connective_1996}. 
+
+To illustrate, consider a hypothetical example of the construction of an online community for building a collaborative knowledge base, such as Wikidata. 
+This can be cast as a collective action problem because the project can provide a wide range of benefits to a potentially large group of people, but no individual can provide the full range of benefits alone \citep{marwell_critical_1993, fulk_connective_1996}.
+Say a single individual, the community's founder who is an expert engineer and researcher, attempts to bootstrap the community by providing an initial design and implementation for the novel system, a small number of entries and by making efforts to publicize the community.
+The founder hopes that others will join and contribute to constructing a valuable resource.
+
+During this period in the community's development, the \emph{critical mass} consists of just the founder, who is motivated and capable of making initial contributions in the hopes that others will see these contributions and subsequently make their own.  The founder has a large and unique set of resources enabling them to pay the \emph{start-up costs} involved in founding the community when no one else would.  After these start-up costs are paid, others can make much more granular contributions like adding entries to the knowledge base. The founder hopes that others will perceive expected benefits from contributing that exceed the costs of contributing. 
+% This might not happen and if time goes by and noone else contributes, the founder, all alone and discouraged, might conclude that it is not longer work making their own contributions. 
+In theoretical terms, the founder hopes that the others' production functions are accelerating and paying the start up costs will move the others' production functions into a favorable region where they will contribute. 
+
+% If some time goes by and noone else contributes, the founder, all alone and discouraged, might conclude that it is no longer worth making their own contributions.  Now the community has failed to hold on to a critical mass and becomes inactive. But say that the founder's early contributions have been useful to somone else (member 2) who chooses to make their own contributions because they expect to benefit from the ``warm glow'' of reciprocity, through social interaction with the founder, or from the future contributions that their own contributions might attract. 
+
+Ecology has important implications for critical mass theory because important aspects of the collective action problem that influence the production function are related to the composition of the group and prior work suggests that individuals with varying experiences are important to online community growth \citep{kairam_life_2012}.
+Heterogeneous groups are thought to be conducive to collective action because they are more likely to contain individuals who can contribute different things like start up costs or rare pieces of information \citep{fulk_connective_1996}.  
+% This makes it easier to form a critical mass of individuals who can make start up contributions \citep{marwell_critical_1993}.
+
+Returning to the example of a collaborative knowledge base, it is important to recognize that many contributions will involve \emph{articulation work} activities like documenting, answering questions, naming, and interpreting that are required to make the knowledge base work in practice \citep{schmidt_taking_1992, suchman_supporting_1996}.
+Even though contributions of articulation work might not directly add new features or data to the knowledge base, they can be important to accelerating community members' production functions.
+A heterogeneous community may be more likely to include members who are skilled at articulation work that benefits other members.
+On the other hand, if different subgroups of a large community have sufficiently different application areas, some articulation work might be specific to each subgroup. 
+For example, biologists might make and document biology-specific norms for the collaborative knowledge base, but this would not be useful to physicists.
+Thus individuals' production functions might depend most strongly on the other members of their subgroup when subgroup-specific articulation work is a limiting factor.
+
+% At the same time, the utility of a collaborative knowledge overall often depends on linking to knowledge outside of one's domain of expertiseso the .
+
+
+
+
+
+
+% elaborate on what a collective action problem is.
+% develop the example of a distributed database and why it might be hard to do collective production of it at different stages or phases
+% at different phases of developement the distributed database the critical mass needed to maintain the collective action dynamic is differnet in composition or in form.  
+% Ecology is related to critical mass theory because important aspects of the collective action problem that influence the production function are related to the composition of the group. Heterogenous groups are thought to be condusive to production because they are more likely to contain individuals who will contribute very much and therby make it easier to form a ``critical mass'' of individuals who can overcome the start-up costs common in collective action problems \citep{marwell_critical_1993}. For example, a start-up cost for an open source database might take the form of an initial design for a novel system that can only be provided by expert engineers or researchers. But once the initial system is developed, additional features, bug fixes, and documentation can be added by a much broader group of developers who wish to use the system in their applications.  Therefore, open source community's ability to biuld a valuable system depends on including both database experts and application developers.  Prior work suggests that individual with varying experiences are important to online community growth \citep{T}.
+
+I am starting work to find out how production functions help explain when online communities achieve critical mass and produce quality outputs and if relationships among communities influence the shape of production functions in ways that make collective action easier or more difficult in different conditions.
+Measuring production functions requires the ability to precisely quantify the quality or value of individual contributions.
+As a step in this direction, I have developed an improved measurement of Wikipedia article quality in research accepted for publication and included in Appendix A.
+Prior article quality measures have been based on machine learning models that do not provide a continuous measure amenable to statistical analysis and that were miscalibrated for units of analysis like articles or projects.
+Research using these measures has got around these problems by adopting an assumption that article quality levels on Wikipedia are ``evenly spaced'' from one another.
+I use a method that relaxes this assumption, provides evidence that it is unfounded, and improves the accuracy of the models.
+
+I have also done some methodological work on the ``demand side'' to understand how audiences use Wikipedia content.  Most prior work has been limited to measuring page views.  In Appendix B, I study the amount of time spent reading articles by Wikipedia visitors and find that readers in the Global South remain on pages for longer, especially in the last page view in a session. 
+Although the measure used in that study may not be available for use in the future,  this work has prepared me for the time when better reading time data is available.
+It will be interesting to see if the audience for an article relates to critical mass dynamics.
+
+\section{Ecology and the Diffusion of Technologies for Community Governance}
+
+Future ecological research can also look at the role of ecological dynamics in the emergence and diffusion of novel artifacts, technologies, information and ideas. Overlapping technology use in particular is a potential mechanism for specialization and mutualism.  I have previously suggested that sharing a host platform may not be sufficient for defining an organizational form because communities have considerable flexibility in making their own rules and configuring their own custom technology.  If sufficiently strong patterns are found in the sets of rules or technologies that communities adopt, these might justify treating communities sharing such structures as organizational forms or at least a potentially important kind of niche overlap. 
+
+When online communities share technologies, this can create important forms of interdependence, and collaborative innovation on tools is potentially an important type of mutualism.
+For example, \citet{chandrasekharan_crossmod:_2019} developed a system called ``Cross Mod'' for subreddits to collaborate on customizable machine learning models for monitoring misbehavior.
+Smaller communities pooling data about rule violations can potentially build more accurate models than single communities can.   
+Technologies like Cross Mod allow communities to select which other communities they wish to import data from and therefore are most useful when communities are institutionally compatible.
+This suggests that sharing governance technologies may be a good proxy for an organizational form. 
+
+However, as I found in Appendix C, my study of algorithmic flagging tools on Wikipedia, machine learning tools for predicting misbehavior may reproduce the biases of community moderators.
+They can also improve the fairness of moderator judgments if moderators use the models instead of other biased social signals to find potential misbehavior. 
+Additional risks may arise when algorithmic tools are shared by overlapping communities.
+The learned norms and standards of behavior from one community may not be appropriate in other communities.
+If shared flagging algorithms can more easily implement norms that are more widely held, the diffusion of an algorithm that makes regulating behavior easier and more predictable might mediate the diffusion of the norm.
+
+The method I developed for the study in Appendix C provides a way to assess the consequences of a machine learning classifier without intervening in a community.
+Future work at the intersection of ecology and online community governance might use this method in a study of the relationships between the performance of algorithms for enforcing different rules, the diffusion of the rules, and the growth and survival of communities having the rules.
+
+
+\section{Microfoundations for Ecological Macrodynamics}
+
+% Good chance this micro-macro stuff heads to the conclusion. Let's keep trying to make it work for now. These 3 paragraphs seem like a  good argument for a study that links individual behavior or user flows to competition/mutualism or density.
+
+Predominant approaches in HCI and social computing and popular conceptions of social media platforms most often emphasize the role of managers of platforms in building online communities.
+However, platforms have only a limited control over the ways that users build communities.
+Furthermore, platforms struggle to maintain participants who may migrate to competing platforms.
+Communities and their organizers can engage in collective action to protest platforms' governance and design decisions \citep{matias_going_2016}.  
+Online communities also form intermediate structures over which platforms have limited influence such as the widespread clusters of highly overlapping communities I identify in Chapter 2. 
+An important goal of the ecology of online communities is to understand how patterns of action within individual communities are co-constitutive with the cultures and institutions of platforms.  
+%Overlapping online communities exist because individuals participate in them, but individuals cannot participate in communities that don't exist. 
+This goal faces a key type of puzzle in social science: to account for how  ``micro-level'' individual actors give rise to ``macro-level'' organizations, institutions, online communities, and cultures even as individuals are situated within these very structures.
+
+
+% This was because taking up inter-organizational dependence as an object of study raised a similar micro-macro puzzle.
+Micro-macro puzzles are not only found in the constitution of individual persons and the social structures they inhabit.
+Organizational ecology takes up a different kind of micro-macro puzzle at the level of reciprocal dependence between organizations and the organizational fields or industries they comprise.
+The performance of an individual organization depends on ecological dynamics in its organizational field, but the organization itself contributes to these very dynamics.
+Initial work in organizational ecology avoided this reciprocal causation by minimizing the action of individual organizations.
+Structural inertia constrained the agency of organizational actors, and external institutions, competition, and legitimacy constrained organizational performance. 
+
+At first, organizational ecologists did not deny that factors internal to organizations matter to organizational performance.
+Yet they argued that \emph{ceteris paribus}, the chances of an organization's survival depend on environmental conditions and on mutualistic and competitive pressures \citep{hannan_organizational_1989}.
+Later on, organizational ecology began accounting for rational adaptation and failure of individual organizations \citep{baum_ecological_2006}. 
+Recently they have incorporated the role of human cognition and social learning into their conceptualizations \citep{hannan_concepts_2019}, but as far as I am aware, empirical analyses have not stretched all the way from individual persons to inter-organizational dynamics.
+
+Online communities provide a distinctive opportunity to connect individual behaviors to outcomes at the community and ecological levels thanks to the finely grained behavioral data that made possible the analyses in Chapters 2 and 4.
+However, all of the measures used in these projects have aggregated the behavior of many individuals into measures of overlap or group size.
+I have not shown how the ways that individuals navigate among overlapping online communities give rise to the ecological dynamics I find.
+Aware of this limitation, I initially proposed constructing an agent-based model to theorize the micro-mechanisms of ecological dynamics.
+Along the way, I found that talking to individuals provided a more valuable micro-level account of how and why people participate in overlapping online communities.
+
+These interviews surfaced a conceptual model of a process by which new communities in a topical area spin off specialists. 
+An important direction for future research will be to operationalize and test this model with data.
+This future work should look for inspiration from measures of individual behavior introduced in recent research in HCI and social computing \citep{tan_tracing_2018, tan_all_2015, zhang_understanding_2021}.
+Specifically, \citet{tan_tracing_2018} provide a method to associate newly created subreddits with prior subreddits whose users join the new subreddit and measure the language use of individuals to characterize their similarity to the other members of the community. Also, \citet{waller_generalists_2019} quantify users of online communities as generalists and specialists based on their activity styles using embedding methods. 
+
+\section{Focused Case Studies}
+
+% find a better rationale than this?
+% Why haven't we done this already? (b/c not as scientific?)
+Finally, in order for ecological research in online communities to be useful to publics and practitioners, it will be important to conduct focused case studies of practical and popular interest.
+Studies of the ecology of political communities, communities trying to make sense of the pandemic, ``meme stock'' and cryptocurrency communities, and pop culture fandom communities are all promising candidates.
+A future project should investigate one or more cases in a mixed-methods study combining carefully constructed time series models for inferring ecological relationships and qualitative data in the form of grounded narrative accounts or interviews.
+
+In conclusion, my research set out to understand interdependence among online communities through the lens of organizational ecology.
+It has questioned how well foundational assumptions of organizational ecology apply to online communities and set out to validate basic assumptions like when online communities will form competitive or mutualistic relationships.
+It has provided new methods for studying competition and mutualism among online communities and shown that mutualistic relationships are more common than competitive ones because they last longer.
+Although the question of how groups of mutualistic online communities are constructed remains open, selection process theories provide a starting point for future investigation.
+Many applications of ecological theories and methods to important questions about the emergence, performance, and design of online communities are promising.
+% As I continue my work, I am releasing well-documented code and datasets to support this future work and I hope, other research yet unimagined.
+
+% bibliography here
+\setcounter{biburlnumpenalty}{9001}
+\printbibliography[title = {References}, heading=secbib]
+
+
+% \end{document}                  %
+
+%  LocalWords:  
+%%% reftex-default-bibliography: ("ch5_conclusion.bib")
--- a/dissertations/nathante_uw_2021/copyright_page.pdf
+++ b/dissertations/nathante_uw_2021/copyright_page.pdf
--- a/dissertations/nathante_uw_2021/diss_ecology_of_online_communities.pdf
+++ b/dissertations/nathante_uw_2021/diss_ecology_of_online_communities.pdf
--- a/dissertations/nathante_uw_2021/ecological_models.bib
+++ b/dissertations/nathante_uw_2021/ecological_models.bib
--- a/dissertations/nathante_uw_2021/equalogy.tex
+++ b/dissertations/nathante_uw_2021/equalogy.tex
@ -0,0 +1,923 @@
+\chapterprecishere{
+Large-scale quantitative analyses have shown that individuals frequently talk to each other about similar things in different online spaces. Why do these overlapping communities exist? We provide an answer grounded in the analysis of 20 interviews with active participants in clusters of highly related subreddits: within a broad topical area, there are a diversity of benefits an online community can confer. These include (a) specific information and discussions, (b) socialization with similar others, and (c) attention from the largest possible audience. A single community cannot meet all needs. 
+Our findings suggest that topical areas within an online community platform tend to become populated by groups of specialized communities with diverse sizes, topical boundaries, and rules. Compared with any single community, such systems of overlapping communities are able to provide a greater range of benefits.
+
+}
+
+
+\section{Introduction}
+
+Early work in social computing treated online communities as isolated units that could be understood without considering their members' participation in other online communities. As community hosting platforms such as Reddit and Facebook have grown in prominence, social computing scholars have sought to document and explore the connections between online communities \citep{datta_extracting_2019, hill_studying_2019, tan_all_2015, zhu_selecting_2014}.
+This research has shown that online communities overlap with each other in terms of their memberships and topics in ways that have important consequences for a range of outcomes \citep{teblunthuis_identifying_2021, chandrasekharan_internets_2018, wang_impact_2012}.
+
+
+User and topic overlap is widespread---both within platforms and across them. 
+For example, a range of studies have highlighted the fact that members frequently participate in multiple online groups. 
+This occurs both serially as users migrate between communities over time \citep{lu_investigate_2019, tan_all_2015, tan_tracing_2018} and concurrently as individuals belong to multiple groups at once \citep{wang_impact_2012, hwang_why_2021, zhu_impact_2014}. 
+Many large platforms host distinct communities with similar topics and content \citep{datta_identifying_2017, zhu_selecting_2014}.
+In at least one study, researchers have documented that overlaps in users and topics often coincide \citep{datta_identifying_2017}.
+In other words, members of online communities often simultaneously participate in overlapping conversations with overlapping groups of people in different online spaces. 
+
+\textit{Why are the same individuals talking to each other about similar things in different online communities?}
+Although social computing offers many theories of why individuals might want to participate in a community, almost all empirical work in social computing on user and topic overlap has used computational or quantitative analysis. As a result, we know very little about what overlaps mean to users. Critically, we also have very little in the way of empirical evidence that is able to speak to why communities overlap in the first place.
+
+Our work seeks to complement existing quantitative research with a better qualitative understanding of intercommunity overlap and contribute to several streams of social computing scholarship. 
+In particular, our work complements a series of social computing studies that have taken inspiration from ecological theory and shown that online groups' growth and survival are closely tied to activity in adjacent online spaces \citep{teblunthuis_identifying_2021, wang_impact_2012, zhu_impact_2014, zhu_selecting_2014}. 
+
+
+We seek to answer our research question (in italics above) through an interview-based study of Reddit users with experience in overlapping communities. Using a dataset of posts and comments on Reddit, we identify clusters of communities on Reddit with highly overlapping users and topics and recruit a set of 20 participants from nine clusters.
+Drawing from a grounded theory analysis of interview transcripts, we develop an explanation of why many users simultaneously participate in communities with overlapping memberships and topics. 
+
+Our findings suggest that users seek three salient benefits from online groups: users want to (a) find specific types of content, discussions, and information; (b) connect with similar types of people; and (c) share content with the largest possible audience. Our work also suggests that these three benefits are frequently in conflict such that the more a community provides one of these benefits, the less able it may be to provide the other two. Because it is difficult for a single community to fully provide all three benefits, clusters of multiple overlapping communities are constructed to do so in aggregate.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+\section{Related Work}
+
+
+
+
+
+
+
+\label{sec:overlapping}
+
+
+Although most research in online communities analyzes the internal factors driving online community success \citep{kraut_building_2012}, a growing literature studies communities related by overlaps in topic or membership \citep{datta_identifying_2017, tan_all_2015, teblunthuis_identifying_2021, zhu_impact_2014}. 
+This work has found that concurrent engagement in multiple communities is common on large platforms that host online communities such as Reddit where individuals smoothly jump from community to community \citep{tan_all_2015}.  
+With several exceptions \citep[e.g.,][]{fiesler_moving_2020, kiene_technological_2019,  zhao_social_2016, hwang_why_2021}, this work typically takes the fact that communities overlap for granted and focuses on the consequences of overlap on outcomes such as the emergence and growth of communities \citep{butler_cross-purposes_2011, zhu_impact_2014} and the diffusion of types of language such as hate speech \citep{chandrasekharan_internets_2018}. None of this work provides insight into how communities come to overlap and why these overlaps persist.
+
+Researchers have investigated intercommunity conflict and found that conflict is initiated by a very small proportion of online communities \citep{kumar_community_2018}.  
+Other work has shown that content cross-posted to different communities contributes to the ongoing renegotiation of the topical boundaries \citep{butler_cross-purposes_2011}. 
+\citet{zhang_understanding_2021} have shown that topical boundaries can also shift as similar communities attract users with different interests.
+In a related sense, \citet{massanari_gamergate_2017} has argued that toxic communities can influence the broader culture of a platform for online communities. As a result, banning problematic communities from a platform such as Reddit can reduce toxicity in adjacent communities that are not directly affected \citep{chandrasekharan_you_2017, ribeiro_platform_2021}.
+
+
+A number of studies on overlapping communities draw upon ecological theory \citep{teblunthuis_identifying_2021, wang_impact_2012,  zhu_impact_2014, zhu_selecting_2014}.
+Ecological approaches in social computing theorize that overlaps between users and topics relate to competitive or mutualistic forces and drive outcomes such as growth and survival. For example, \citet{wang_impact_2012} found that membership overlap reduced the growth rate of Usenet groups. \citet{zhu_selecting_2014} found that participation rates often suffered if there was too little or too much overlap with other communities. 
+\citet{zhu_impact_2014} found that communities' survival was positively associated with membership overlap, especially with overlap with older communities. Recently, \citet{teblunthuis_identifying_2021} found that mutualism is common in clusters of overlapping subreddits.
+
+Although these studies use statistical analysis to tell us about how communities relate to each other, they do not speak to \textit{how} participants understand the relationships between similar online communities or \textit{why} they participate in overlapping communities. The exclusively quantitative nature of these accounts means that a range of potential explanations are possible.
+
+Although we know of no qualitative examination focused directly on understanding why overlapping communities exist, there are a series of qualitative papers that point to potential answers. \citet{fiesler_moving_2020} describe the history of online fanfiction writing communities migrating across platforms in pursuit of hospitable infrastructure. Similarly, \citet{zhao_social_2016} describe how individuals use multiple social media platforms to meet varied and nuanced communication needs.
+Although their study is primarily quantitative, \citet{zhu_selecting_2014} include quotes from interviews to support the emic validity of notions of competition and mutualism between groups in an enterprise social media system.  
+Finally, \citepos{hwang_why_2021} paper seeks to explain why individuals participate in persistently small online communities on Reddit and ends with a reflection that many small communities are sustainable only because they are ``nested'' within larger niches.
+All told, these findings suggest a rich social process by which participants in online communities purposefully construct and move between overlapping spaces.
+
+
+% Social computing scholarship has pointed to differences in affordances between platforms and the ways in which members migrate between communities over time.
+However, the very small amount of qualitative evidence from participants in overlapping communities in the same platform means that we lack a strong sense of why members choose to participate in multiple communities simultaneously. 
+Although ecological studies attempt to quantify competition and mutualism, we know little about how members understand the relationships between their communities or if these key ecological concepts have any emic resonance.  
+Our work seeks to place ecological studies of online communities on firmer qualitative ground.
+
+
+
+
+
+
+
+
+
+
+
+
+
+\subsection{Reasons for Joining Online Communities}
+\label{sec:reasons}
+
+Decades of social computing research has sought to understand why people belong to particular online communities \citep{kraut_building_2012}. 
+It has long been recognized that different people have different motivations and that a single individual may have multiple motivations that include the social, informational, and material benefits users receive through their participation \citep{butler_membership_2001,  turner_where_2005, xigen_li_factors_2011}. In terms of uses and gratifications theory, ``users actively seek particular media with the goal of gratifying an existing need'' \citep{lampe_motivations_2010}. 
+Past research has shown that people seek online communities to collaborate on projects \citep{poor_computer_2014}, to receive social support \citep{leimeister_evaluation_2005}, to cooperate with friends \citep{turner_where_2005}, and, especially, to exchange information \citep{ridings_virtual_2004, muhtaseb_arab_2008, leavitt_role_2017, liang_knowledge_2017}. 
+Other research focuses on the growth and decline of membership in online communities and surfaces motivations for why people choose not to participate \citep{cunha_are_2019}. \citet{brandtzaeg_user_2008} found that a lack of trust or low quality content can lead to declines in membership.  
+Online communities may decline because leaders are resistant to change and unwelcoming to newcomers \citep{shaw_laboratories_2014, halfaker_rise_2013, teblunthuis_revisiting_2018}. 
+
+
+
+
+Although our findings are the result of an inductive process of bottom-up grounded theory analysis, the presentation of our findings relies on three existing concepts.
+
+
+
+
+
+
+\subsubsection{Finding specific content}
+
+One of the most important features of online communities is their ability to enable the spread of useful knowledge and information \citep{faraj_online_2016}. By connecting individuals with specific information and skills that they desire, online communities match knowledge seekers with experts and foster collaboration on information goods \citep{benkler_wealth_2006,  lakhani_how_2003, fulk_connective_1996,fiesler_growing_2017}.
+Research has often focused on the ways that individuals utilize diverse types of social computing systems to meet their specific information needs through systems such as Q\&A sites \citep{adamic_knowledge_2008}, synchronous chat systems \citep{white_effects_2011}, search engines \citep{morris_comparison_2010}, social network sites \citep{starbird_crowd_2012,morris_what_2010}, fanworks \citep{fiesler_growing_2017}, and knowledge bases \citep{ackerman_answer_1990,orlikowski_learning_1992}.  
+
+
+
+
+
+\subsubsection{Homophily}
+
+A second need that online communities serve is to foster connections with similar others. The term \textit{homophily}, ``a tendency for friendships to form between those who are alike in a designated respect''  \citep{lazarsfeld_friendship_1954}, describes the set of benefits people can only receive from others who share their identities, beliefs, interests, or culture \citep{mcpherson_birds_2001}.  
+In offline settings, homophily helps explain why tastes in cuisine, music, and other cultural preferences are often correlated \citep{dellaposta_why_2015}, why similar people tend to congregate, and what happens when they do \citep{mcpherson_birds_2001}. 
+Homophily on social networks may drive the emergence of online ``echo chambers'' as individuals seek online communities whose members share their beliefs \citep{johnson_communication_2009, grevet_managing_2014, himelboim_valence-based_2016, dvir-gvirsman_media_2017}. 
+
+Research has shown that people have greater degrees of trust in homophilous groups and are more likely to share content posted by homophilous others \citep{ma_when_2019, chang_specialization_2014}.
+Homophily has been described as an important feature of online fan communities \citep{hillman_alksjdflksfd_2014,fiesler_growing_2017}.
+
+
+
+
+
+
+\subsubsection{Finding the largest possible audience}
+
+Research on online communities producing public information goods has found evidence that audience size motivates contributors \citep{zhang_group_2011}. Additionally, numerous studies have shown that users of social networking sites frequently consider the audience that their posts and messages may reach \citep{marwick_i_2011, zhang_configuring_2020}. 
+As individuals on social media typically have little information about who sees their posts, they conceive of ``imagined audiences'' based on cues from visible activity \citep{bernstein_quantifying_2013} and target imagined audiences using deliberate strategies, such as using multiple platforms to reach distinct audiences, in order to control who sees or does not see their posts \citep{litt_just_2016, marwick_i_2011, zhao_social_2016}. 
+
+
+
+
+
+
+
+
+
+ 
+
+\section{Study Design}
+\label{sec:methods}
+
+To study overlapping membership in online communities, we conduct interviews with members of online communities hosted on Reddit, a social media platform for sharing, discussing, and rating news, media, and other content in user-created subcommunities called ``subreddits.'' Individual users can participate in any of Reddit's millions of subreddit communities by posting ``submissions'' that might include a link to a news article, a question for discussion, an image, or text written by the submitter. Each submission has a corresponding threaded comments section. Users can also vote submissions and comments up or down as a form of distributed moderation and can give awards to comments and posts \citep{lampe_slashdot_2004, burtch_how_2021}.
+
+Subreddit communities are managed by teams of volunteer content moderators tasked with curtailing abusive behavior and keeping conversation on topic \citep{matias_civic_2019, seering_moderator_2019}. 
+Subreddits exist covering an enormous range of topics \citep{fiesler_reddit_2018}, and
+Reddit has been the site of much research on overlapping online communities \citep{datta_identifying_2017, tan_all_2015, tan_tracing_2018, hessel_science_2016, teblunthuis_identifying_2021}.
+Because the cost of creating and joining new communities on Reddit is very low, subreddits often overlap in both topic and membership. Users frequently create spinoff subreddit communities from larger and more established groups \citep{tan_tracing_2018}.
+
+\subsection{Participant Selection}
+
+To understand why people participate in overlapping communities, we set out to interview people who are active in highly related subreddits. Additional inclusion criteria were that users were adults (i.e., above the age of majority in their country) and able to participate in an interview in English.  
+
+Our participant selection process began by first choosing clusters of highly related groups. To do so, we built a web-based data visualization of a clustering algorithm derived from user overlap to identify groups of interest-based subreddits having similar users.
+To generate the visualization, we conducted a computational analysis of the Pushshift Reddit dump \citep{baumgartner_pushshift_2020}, containing a nearly complete collection of Reddit comments made before April 2020. We selected the top 10,000 subreddits based on the number of comments in this data and excluded subreddits where a majority of submissions were flagged as not safe for work. Next, following an approach described in prior work \citep{datta_identifying_2017}, we constructed the measure of user similarity by taking the cosine similarities of TF-IDF vectors. Using this similarity measure, we ran affinity propagation clustering \citep{frey_clustering_2007} to group subreddits having overlapping users.  We then built an HTML visualization of these clusters based on t-distributed stochastic neighbor embedding (t-SNE). We have included the visualization in our online supplement. 
+
+Although some aspects of our manual cluster selection process using this visualization were necessarily arbitrary, we tried to select clusters that were interest driven, involved primarily English language discussion, and were focused on content about which all members of the research team would be comfortable speaking. As a result, we did not select any clusters that were focused on sex or pornography, fringe or extreme politics, content specific to geographic regions, or topics that our group could not understand. 
+
+We sought out clusters that we hoped would result in individuals from a diverse range of ages, genders, and life experiences. Although we did not collect demographic information from our interviewees, our interviewees' presentation and descriptions of themselves suggested that these efforts were not entirely successful. 
+Our pool of interviewees included young and middle-aged people; people of color; people from the United States, Canada, and Europe; people who did not speak English as a first language; and people who were non-male. 
+That said, men were very likely over-represented in our pool of interviewees, perhaps even in relation to the disproportionate participation of men on Reddit \citep{amaya_new_2021}. 
+
+The clusters we selected each include 3--10 subreddits on the following topics: rock climbing, streetwear fashion, roller coasters, vintage audio, podcasting, painting, drag culture and performance, indie music, and dating for middle-aged adults. Information about each subreddit and cluster can be found in Table \ref{tab:subs_clusters_stats}.
+
+
+\begin{table}[h!]
+  \footnotesize
+    \centering
+    \begin{tabular}{cccc}
+\hline
+  \textbf{Subreddit}    & \textbf{Cluster}  & \textbf{Subscribers} & \textbf{Created} \\
+  \hline
+\rowcolor{lavenderblue} 
+r/bouldering          & Climbing          & 194,814              & 2009-10-28       \\ 
+\rowcolor{lavenderblue} 
+r/climbharder         & Climbing          & 117,288              & 2010-10-19       \\ 
+\rowcolor{lavenderblue} 
+r/climbing            & Climbing          & 935,621              & 2008-07-17       \\ 
+\rowcolor{lavenderblue} 
+r/climbingcirclejerk  & Climbing          & 45,032               & 2011-08-18       \\ 
+r/Drag                & Drag              & 44,724               & 2011-01-15       \\ 
+r/Dragula             & Drag              & 27,510               & 2016-11-03       \\ 
+r/rupaulsdragrace     & Drag              & 440,329              & 2011-11-15       \\ 
+r/RPDR\_UK            & Drag              & 31,867               & 2019-02-07       \\ 
+r/SpoiledDragRace     & Drag              & 69,027               & 2018-02-16       \\ 
+r/MsPaintsArtRace     & Drag              & 61,292               & 2017-04-17       \\ 
+\rowcolor{lavenderblue} 
+r/MGMT                & Indie Music       & 17,744               & 2010-02-25       \\ 
+\rowcolor{lavenderblue} 
+r/tameimpala          & Indie Music       & 94,248               & 2011-10-30       \\ 
+\rowcolor{lavenderblue} 
+r/kgatlw              & Indie Music       & 59,191               & 2015-07-01       \\ 
+\rowcolor{lavenderblue} 
+r/Indieheads          & Indie Music       & 1,932,698            & 2013-12-24       \\ 
+r/datingoverthirty    & Middle Age Dating & 436,480              & 2014-11-04       \\ 
+r/DatingAfterThirty   & Middle Age Dating & 11,550               & 2018-03-09       \\ 
+r/datingoverforty     & Middle Age Dating & 52,522               & 2018-12-15       \\ 
+r/relationshipsover35 & Middle Age Dating & 14,916               & 2018-02-06       \\ 
+\rowcolor{lavenderblue} 
+r/OilPainting         & Painting          & 186,716              & 2011-09-22       \\ 
+\rowcolor{lavenderblue} 
+r/Painting            & Painting          & 280,865              & 2008-06-13       \\ 
+\rowcolor{lavenderblue} 
+r/PourPainting        & Painting          & 178,800              & 2017-07-28       \\ 
+\rowcolor{lavenderblue} 
+r/Watercolor          & Painting          & 269,882              & 2012-01-15       \\ 
+\rowcolor{lavenderblue} 
+r/HappyTrees          & Painting          & 53,362               & 2011-02-07       \\ 
+r/podcasts            & Podcasting        & 1,995,693            & 2008-01-25       \\ 
+r/podcast             & Podcasting        & 60,497               & 2009-01-02       \\ 
+r/podcasting          & Podcasting        & 73,010               & 2010-09-17       \\ 
+r/audiodrama          & Podcasting        & 129,102              & 2010-11-30       \\ 
+r/ska                 & Podcasting        & 34,397               & 2008-03-12       \\ 
+\rowcolor{lavenderblue} 
+r/guessthecoaster     & Rollercoasters    & 5,094                & 2017-06-30       \\ 
+\rowcolor{lavenderblue} 
+r/rollercoasterjerk   & Rollercoasters    & 12,378               & 2016-07-14       \\ 
+\rowcolor{lavenderblue} 
+r/rollercoasters      & Rollercoasters    & 66,652               & 2010-07-31       \\ 
+\rowcolor{lavenderblue} 
+r/rct                 & Rollercoasters    & 55,275               & 2010-08-04       \\ 
+\rowcolor{lavenderblue} 
+r/themeparkitect      & Rollercoasters    & 13,536               & 2014-06-16       \\ 
+r/streetwear          & Streetwear        & 2,678,745            & 2011-04-30       \\ 
+r/supremeclothing     & Streetwear        & 154,797              & 2012-04-04       \\ 
+r/womensstreetwear    & Streetwear        & 421,279              & 2016-04-25       \\ 
+r/bapeheads           & Streetwear        & 19,672               & 2013-08-12       \\ 
+r/malefashion         & Streetwear        & 207,843              & 2011-04-02       \\ 
+r/sadboys             & Streetwear        & 74,932               & 2013-06-30       \\ 
+r/techwearclothing    & Streetwear        & 94,675               & 2017-03-01       \\ 
+r/Vans                & Streetwear        & 51,997               & 2011-07-01       \\ 
+\rowcolor{lavenderblue} 
+r/cassetteculture     & Vintage Audio     & 45,615               & 2011-05-25       \\ 
+\rowcolor{lavenderblue} 
+r/typewriters         & Vintage Audio     & 20,037               & 2010-10-25       \\ 
+\rowcolor{lavenderblue} 
+r/vintageaudio        & Vintage Audio     & 59,202               & 2011-09-18       \\ 
+\hline
+    \end{tabular}
+    \caption{Clusters of subreddits from which we recruited participants, subscriber counts at the time of the study, and the creation date of each subreddit.}
+    \label{tab:subs_clusters_stats}
+ \end{table}
+
+
+Using the Pushshift Reddit dataset, we identified candidate participants who were among the top 80\% most frequent commenters within each cluster, who participated in multiple subreddits in the cluster, and who were active in the cluster during a period of at least 1 calendar year. 
+We began recruiting a random sample of 50 candidates matching these criteria within each cluster by sending direct messages through Reddit. Interested potential recruits filled out a short online survey confirming that they were adults and able to participate in English language interviews. The survey also asked participants about their participation and familiarity with each of the subreddits in each cluster to verify that they were knowledgeable. 
+At the beginning of each interview, we asked if there were any other subreddits related to those identified by the clustering algorithm. As a result, our conversations were not limited to the subreddits listed in Table \ref{tab:subs_clusters_stats}.
+
+We began by recruiting participants from the first three clusters listed in Table \ref{tab:subs_clusters_stats}. We found ourselves reaching saturation within these clusters quickly. We also found that different clusters were surfacing quite different data. In response, we added additional clusters and recruited at least two participants from each until we reached global saturation.  In some clusters, we did not reach saturation in two interviews. In these cases, we sent additional invitations and conducted additional interviews. 
+In total, 20 participants were successfully recruited and interviewed by five members of the research team before we reached global saturation and ceased data collection. The characteristics of our interviewees are presented in Table \ref{table:participants}.
+
+All of our interviews were semistructured. Although we drew from a long series of open-ended questions about participation in different subreddits and the relationships between communities, we chose our questions based on what our subjects wanted to talk about. A copy of our interview protocol is included in our supplementary material.
+Interviews were 49 min long on average but varied substantially in length. We suggested conducting interviews over Zoom but offered participants their choice of communication channel. As a result, we conducted two interviews over the phone, one using Discord chat, and the rest over Zoom.
+Interviews were transcribed automatically using Zoom's built-in transcription and the otter.ai service and were then manually corrected by the authors. After each interview, participants were compensated with a digital gift card for \$20 USD through the Tango Card reward service.\footnote{\url{https://www.tangocard.com/}}
+
+
+
+\subsection{Qualitative Data Analysis}
+
+Our analysis followed \citepos{charmaz_constructing_2015} approach to grounded theory as closely as possible. We conducted coding and data collection in parallel. We generated over 950 codes, which we then grouped in an iterative axial coding process that generated 18 thematic memos.  As we completed collecting data, we refined our codes and combined themes to identify answers to the following orienting research questions: Why are there so many similar online communities? And why not more? Although primarily inductive, our analysis was influenced by sensitizing concepts from prior work including our knowledge of scholarship on overlapping online communities described in §\ref{sec:overlapping} and the reasons that people participate in online communities summarized in §\ref{sec:reasons}.
+In analyzing our data, we noted that interviewees described their participation in multiple different subreddits and their preference for particular subreddits in terms of the inability of one community (often the ``main'' or ``largest'' community) to provide the desired benefits. This observation formed the basis of the grounded theory around which we organize our findings. 
+
+\begin{table}[h!]
+\footnotesize
+    \centering
+    \begin{tabular}{ccc}
+\hline
+\textbf{Participant ID} & \textbf{Cluster}     & \textbf{Interview Length (min)} \\ \hline
+\rowcolor{lavenderblue} 
+C1                      & Climbing             & 56                                  \\ 
+\rowcolor{lavenderblue} 
+C2                      & Climbing             & 51                                  \\ 
+\rowcolor{lavenderblue} 
+C3                      & Climbing             & 41                                  \\ 
+D1                      & Drag                 & 51                                  \\ 
+D2                      & Drag                 & 67                                  \\ 
+\rowcolor{lavenderblue}
+I1                      & Indie Music          & 71                                  \\ 
+\rowcolor{lavenderblue}
+I2                      & Indie Music          & 43                                  \\ 
+O1                      & Podcasting           & 30                                  \\ 
+O2                      & Podcasting           & 44                                  \\ 
+\rowcolor{lavenderblue} 
+P1                      & Painting             &  58                                   \\ 
+\rowcolor{lavenderblue} 
+P2                      & Painting             & 35                                  \\ 
+\rowcolor{lavenderblue} 
+P3                      & Painting             & 40                                  \\ 
+\rowcolor{lavenderblue} 
+P4                      & Painting             & 35                                  \\ 
+R1                      & Rollercoasters       & 24                                  \\ 
+R2                      & Rollercoasters       & 43                                  \\ 
+\rowcolor{lavenderblue} 
+S1                      & Streetwear           & 79                                  \\ 
+\rowcolor{lavenderblue} 
+S2                      & Streetwear           & 55                                  \\ 
+T1                      & Dating in Middle Age & 63                                  \\ 
+T2                      & Dating in Middle Age & 53                                  \\ 
+\rowcolor{lavenderblue} 
+V1                      & Vintage Audio        & 34                                  \\ 
+\rowcolor{lavenderblue} 
+V2                      & Vintage Audio        & 56                                  \\ \hline
+    \end{tabular}
+\caption{List of anonymized participant IDs, the cluster from which we recruited them, and the length of their interview.}
+\label{table:participants}
+\end{table}
+
+
+\subsection{Ethical Considerations}
+Our study design was reviewed by the Institutional Review Board (IRB) at the University of Washington and was determined to be exempt. As part of the design of this study, we took several steps to protect the privacy of our research participants.  Participants were fully briefed about the design of the study before being interviewed and were given documents concerning the study and contact information for our IRB. Explicit consent was obtained from every participant.
+
+Because this project involved collaboration with a relatively large team, we used the Keybase end-to-end encryption service for all discussion and data sharing.
+Finally, participants were anonymized so that no direct identifier was recorded in the process of data collection, and only anonymized pseudonyms (e.g., C1, P2, and V2, as shown in Table \ref{table:participants}) are published in this paper. We made several minor edits to quotes to obscure potentially identifying details.
+
+
+%
+\section{Findings}
+
+Why do people participate in multiple online communities around the same topic? The answer that emerged from our grounded theory is that no one community can provide all the benefits that users want. At a high level, we find that people have multiple and diverse motivations for participation in online communities. In §\ref{sec:benefits}, we describe the types of benefits they seek organized into three categories: (a) engaging with specific types of content, (b) homophilous socialization, and (c) sharing content contributions with as large an audience as possible. 
+In §\ref{sec:tradeoffs}, we use data from our interviews to describe the tensions between these benefits.
+We also investigate how our interviewees understood competition and mutualism---key concepts from ecological studies in social computing---between overlapping communities. Our interviewees overwhelmingly found mutualism to be more consistent with their understandings of overlapping online communities than competition.
+Our contribution comes in the form of a theoretical framework, grounded in our data, that describes how the full benefits of participating in communities can only be satisfied by groups of communities.  
+
+
+\subsection{Benefits Users Seek from Communities}
+\label{sec:benefits}
+
+
+\subsubsection{Specific kinds of content}
+\label{sec:content}
+
+
+Content on Reddit is organized into subreddits that define their own topical boundaries. These boundaries may be broad (e.g., news) or narrow (e.g., types of painting media). Moreover, subreddits that prohibit types of content or behavior generate niches for subreddits with different rules. Despite such forms of specialization, multiple communities often welcome the same content and encourage users to ``cross-post'' material.
+
+A subreddit's topic---what it is about and what content should be posted---is often signified by its name. A climbing enthusiast explains:
+
+\blockquote[C1]{I think the name itself [\texttt{r/climbharder}], kind of specifically points out that: this is not for people who climb hard. It's for people who climb and want to climb hard\textit{er}. 
+}
+
+\noindent C1 describes how the purpose of a subreddit is tied to its name by emphasizing the adjectival suffix ``-er'' as indicative of the fact that the subreddit is not about achieving elite performance but about improving.
+
+Similarly, a participant in subreddits about drag performance invokes Marshall McLuhan to describe how they know what content to post and where to post it:
+
+\blockquote[D1]{ Let’s say you were a drag artist and you wanted to show off something that you just created. You would have to go select which community you wanted to show it off in. And I guess among those, [\texttt{r/Drag}] would be the one to do that in. But if you’re---if you’re wanting to show off a piece of artwork or something that you made of a queen from Rupaul's drag race---and the best place to show that off would be to go to [\texttt{r/rupaulsdragrace}] and post it there. So it’s [a] `the medium is the message' kind of thing. \ldots\ You know where would get the most views [and] where would be the best place to post your content.}
+
+\noindent Like D1, our informants had deep knowledge of what kinds of specific content would be appropriate for each subreddit in their cluster. 
+
+
+Specialization also occurred as a form of regulatory arbitrage when one community had formal or informal rules about the kind of content that was allowed. In these cases, we would often hear about an adjacent community where breaking the rules is accepted, perhaps even the raison d'être. For example, \texttt{r/rupaulsdragrace} prohibits spoilers and information about the outcomes of a reality TV show. \texttt{r/spoileddragrace} is a community about the same show that allows spoilers.
+
+This pattern is so widespread on Reddit that it is often signaled in subreddit naming conventions \citep{hessel_science_2016}. The ``meta'' prefix signals meta-discussions, often drama-centered, about another subreddit. The ``jerk'' suffix signals a space for memes, mockery, silliness, or other content unaccepted in the ``main'' subreddit. Both are commonly understood and were discussed at length by our interviewees. 
+For example, among the Rollercoasters subreddits, R1 described the ``jerk'' subreddit as a ``joke subreddit'' where members of the main rollercoasters subreddit could make fun of themselves:
+
+\blockquote[R1]{I would definitely say \texttt{r/rollercoasters} and \texttt{r/rollercoasterjerk} are really deeply intertwined. It's usually all the same members and stuff because of the fact that the coaster `jerk' is just meant to make fun of the main subreddit. It's just a joke subreddit.}
+
+
+
+\noindent ``Jerk'' subreddits were a common source of discussion among our participants. 
+
+Among the Climbing subreddits, the ``main'' subreddit about rock climbing (\texttt{r/climbing}) is welcoming to newcomers. C1 explained that members upvote posts by newcomers ``to encourage more entrance into the sport.'' However, newcomer posts are often repetitive pictures of people climbing in gyms or videos of famous climbers. This annoys some experienced climbers. The ``jerk'' subreddit provides a backstage space where making fun of newcomers is permitted.
+
+In addition to being divided by rules, interrelated subreddits can be structured as a ladder of ``conceptual rungs'' where one finds larger communities as one ascends the ladder. A participant in the subreddits on art and painting described this phenomenon as
+
+
+
+
+\blockquote[P2]{
+You go up through these conceptual rungs.
+\ldots\ 
+When you go up from, say, \texttt{r/OilPainting}---like \texttt{r/HappyTrees} to \texttt{r/OilPainting}---it’s a much bigger community. And then from \texttt{r/OilPainting} to \texttt{r/Painting}, which is even bigger.
+}
+
+\noindent P2 explained that smaller subreddits such as \texttt{r/HappyTrees} support learners and are generally more welcoming places. Although the quotation above suggests that the size of communities increases as one moves up conceptual rungs, the relationship between topical scope and size was more complicated. In some topical areas, subreddits with relatively specific topics have the largest and most active communities. For example, \texttt{r/rupaulsdragrace} is the most active drag subreddit by a large margin, even though it focuses on a reality TV series that is part of the broader drag community covered by \texttt{r/drag}.
+
+Although many specialized subreddits exist, people who want to share their work, ask a question, or have a specific discussion may not know the best place to post. Cross-posting---i.e., when someone posts the same content, questions, or messages in multiple communities---is widespread on Reddit.
+Cross-posting has sometimes been viewed negatively as a form of attention grabbing (i.e., ``karma whoring'') \citep{poor_mechanisms_2005}.
+More often, however, we heard that cross-posting was acceptable and even encouraged to establish complementary conversations or find different audiences. 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Multiple interviewees from the Climbing cluster, including C1, described how, when people ask for training advice in \texttt{r/climbing}, the largest subreddit about rock climbing, they will be advised to cross-post to \texttt{r/climbharder}:
+
+\blockquote[C1]{Somebody will post asking for advice in \texttt{r/climbing} and oftentimes, somebody will comment and be like, `Hey, you know? You’re welcome to ask this here, but you might get more and better responses at \texttt{r/climbharder}.'}
+
+\noindent C1 explained that even though conversations about training often start in the main subreddit, they are not likely to gain traction because not everybody in the main community is interested in the more intensive aspects of climbing.  
+
+
+
+
+
+
+
+
+
+
+In sum, the ecosystem of subreddits about similar topics provides more opportunities for people to find specific desired discussions. People receive positive feedback and engagement when they post content that fits a subreddit's specific topic. That said, the subreddit where a particular piece of content will be best received is often not clear to the person posting it. Cross-posting provides multiple chances to start a desired discussion. 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+\subsubsection{Homophily}
+
+
+
+Online communities have long been recognized as a way to ``find my people'' by bringing together users who share things as diverse as a psychiatric diagnosis, enthusiasm for a hobby, or membership in a subculture or identity group. A member of the Middle Age Dating cluster of subreddits explains: 
+
+
+\blockquote[T2]{[When I joined the ADHD Reddit sites], I feel like I found my people after all these years.
+\ldots\
+If you don't have ADHD, and don't wonder what's going on other people's brains all the time, I think you just think that everybody thinks like you. And they don't. They don't. So if you're 30 and you're having a problem, you really just want to talk to other 30 somethings.
+}
+
+\noindent T2's description of having ``found my people'' and talking to other people like themselves invokes the idea of homophily: the desire to connect to others similar to oneself. 
+Analytically distinct from finding personalized information in narrowly focused subreddits, homophily was frequently cited as an end in itself by our interviewees.
+Our interviewees sought to connect with ``like-minded'' people having similar interests, demographics, identities, tastes, and status.
+
+Even though the identities of others in subreddit communities are largely invisible, participants can easily imagine the demography of the subreddit. A participant in the Drag cluster of subreddits described \texttt{r/Dragula}, a community of fans of a TV show featuring horror-infused drag styles, as follows:
+
+
+
+
+
+\blockquote[D2]{ 
+I think it would be a mostly LGBTQ audience. And not many straights. But if there are straights, they would be really open minded or edgy. Or, I don’t know \ldots\  associated with that `dark' aesthetic.}
+
+\noindent D2's thoughts on \texttt{r/Dragula} convey a clear sense of the audience of the subreddit. Of course, the pseudonymous nature of Reddit obscures age, race, gender, and ethnicity. That said, Reddit users draw on stereotypes about fanbases and cues such as mentions of schools, selfie posts, linguistic markers, and cultural references to build clear models of the types of people in a subreddit. In further unpacking these dimensions, D2 contrasts \texttt{r/Dragula} with the more mainstream subreddits about the show \textit{Rupaul's Drag Race}: 
+
+\blockquote[D2]{ [As for subreddits about] the drag race (\texttt{r/rupaulsdragrace}), Drag Race UK (\texttt{r/RPDR\_UK}), and the spoiled drag race (\texttt{r/SpoiledDragRace}). \ldots\ 
+Most of [the participants in these other groups] don’t do drag. Most of them are, I think, white gay men, or straight women who see drag with a very narrow view of what drag is. Hegemonic? I don’t know if that’s the word, but they apply the same standards of beauty that are applied to women and men and artists and performers to this art form. 
+}
+
+\noindent D2 conveyed both a strong sense of the demographics of different drag subreddits and a strong sense of identification with \texttt{r/Dragula}, which they described as less toxic, more inclusive, and more creative, in part because its membership has a greater concentration of LGBTQ and non-White people who are less interested in conforming to hegemonic beauty standards.  
+
+
+
+
+
+Subreddits divide broad topical areas such as drag, art, and fashion into subgroups of people occupying strata of status hierarchies associated with identity, expertise, and class. For example, in the Climbing cluster of subreddits, rock climbing ability confers status and separates beginners from advanced athletes. We found that these two groups concentrate their participation in different subreddits. Across the clusters, we found that experts sought out fellow experts with whom to share knowledge, offer reflections, and give advice grounded in shared extensive experience.
+
+Our Streetwear interviewees reported that subreddits about fashion are split along lines that are associated with the price and status of the clothes being discussed:
+
+
+
+\blockquote[S1]{The kind of person, the Platonic ideal poster or user of something like \texttt{r/streetwear}, is probably more open-minded, maybe, in terms of what they think is cool, what they think is worth wearing. Whereas, you know, \ldots\ the \texttt{r/malefashion} snob is a snob.}
+
+\noindent Even though users of \texttt{r/streetwear} share and discuss men's fashion, \texttt{r/malefashion}, which focuses on higher-status and more expensive styles, looks down on their casual and youthful styles.
+S1 is a member of the \texttt{r/streetwear} subreddit. 
+Although their groups are ``chill'' and ``supportive,'' higher-status groups are ``snobby.'' It is clear that S1 feels unwelcome and out of place in the higher-status group.
+
+Similarly, our interviewees described status hierarchies in Painting subreddits related to skill level and medium.
+P4 described how they were invited to cross-post their work from \texttt{r/Watercolor} to \texttt{r/Artoilpainting}, a smaller subreddit that seems to have a complicated relationship with watercolor.  Although watercolor submissions are allowed, and, in this instance, encouraged, both the subreddit's name and the similarity between its visual tag for watercolor submissions with the downvote button suggest that oil is the preferred medium in this community.
+In this way, the division of topical spaces into spheres of similar status and identity allows members to find groups that exclude both those who look down on them and those who they look down upon.
+
+
+
+
+
+
+
+Although ``finding your people'' is satisfying in itself, it can also be a foundation for a wide range of other kinds of benefits. 
+For example, a homophilous community leads to conversations that can promote trust. Trust has many benefits such as building confidence in the advice and information shared within a community. In some communities we studied, this trust enabled buying, selling, and trading of material goods.
+
+V2, one of our interviewees from the Vintage Audio cluster, described a community of record collectors on Reddit that acted as a market for buying, selling, and trading records. They preferred this subreddit to other online markets such as eBay because the community holds members accountable for honest transacting and because of the intrinsic reward that comes from sharing records with a fellow community member:
+
+\blockquote[V2]{Because it's a group of people that are like-minded, \ldots\ your feet are kind of held to the fire a little bit more about actually being realistic with the condition [of the material you are selling]. Whereas, [when you buy] vinyl at the used record shop, sometimes you feel like someone's trying to pull one over on you \ldots\ I feel like because it is a community, sometimes you can get some kind of better deal \ldots\ \\ I found other people that share the hobby that I like. So I almost, definitely, feel like they’re friends in a little way. And so I want to, if I’m ever selling, I’m going out of my way to make sure that whatever I’m doing, everything I’m doing, is above board.
+}
+
+V2 was very enthusiastic about the ``marketplace wrapped in a community'' for vinyl records. According to V2, both buyers and sellers of records benefit from transacting within a community of like-minded hobbyists. Because the community holds sellers accountable, the community promotes honest representation of merchandise.  Being part of a like-minded community where members feel friendship with each other gives sellers a reason to be honest, and even to discount their wares, because they get ``some kind of better deal.''
+
+In sum, our interviewees turned to specific subreddits to find people who share their interests, tastes, problems, and identities.
+Our participants described subreddits in terms of demographics and identity groups as well as styles, subgenres, or categories related to social status such as wealth, expertise, and beauty standards. They used these categories to place themselves within the constellation of related subreddits they participated in.
+Members of subreddits who are ``finding their people'' benefit each other by acting as communities as well as building trust and feelings of friendship. Over time, these feelings can provide further benefits such as the ability to more safely engage in buying and selling. 
+
+\subsubsection{Finding the largest possible audience}
+
+A third type of benefit derives from the number of members in a subreddit.
+All our interviewees were keenly aware of the fact that a post reaching one of the top positions on a larger subreddit would receive the attention of a vast audience.  They described this attention as emotionally thrilling and otherwise beneficial. For artists and influencers, large audiences brought material rewards. For learners, a large audience's collective knowledge could bring hard-to-find answers and advice.
+
+That said, our interviewees explained that larger subreddits do not necessarily provide a larger audience because posts in larger subreddits are more likely to be ignored or missed in the torrent of other content. Although posting in a smaller subreddit might increase the chances of finding an audience at all, subreddits that were too small were described as unattractive because they would not attract many posts or replies. Interviewees responded by choosing where to post strategically.
+
+Although the competition for the top spots on the front page of large subreddits can be fierce, this competition can make recognition from a large subreddit extremely gratifying: 
+
+
+
+\blockquote[P2]{Likes are just kind of fake: fake social currency. But yeah, when you get a charge out of it, yeah, I love it. Most of the time, painting is a really busy sub. I mean, like, in any given hour, the new page is already replaced.
+\ldots\ \\ 
+If you can get something that gets a hold there and stays on the front page for a little while, [if] it gets up in even the top five, I've had a handful do that. That's kind of cool. 
+}
+
+\noindent P2 describes the thrill of reaching top positions in \texttt{r/painting} with posts of their paintings.  Even though they are dismissive of likes on Reddit, they desire the attention their work gets from the subreddit. It sends traffic to their websites, raises their artistic profile, and helps them sell their art. Although these material incentives are important, part of the thrill comes from knowing that a given subreddit is competitive. Smaller subreddits are simply unable to provide these benefits. 
+
+
+However, posting in a large subreddit means the risk of being ignored:
+
+
+
+
+
+
+
+\blockquote[S2]{I think there’s this weird bell curve where the community needs to be big enough where people want to post content. But it can’t get too big where people are drowning each other out for attention.}
+
+\noindent S2 was among several of our interviewees who described an ideal ``middle ground'' for subreddit size. In general, we heard that people were less interested in posting content in very small subreddits that do not provide an audience. Thus, competition over the largest audiences drives people to smaller subreddits where they can reliably find an audience. I2 from our Indie Music cluster explained:
+
+\blockquote[I2]{Usually \texttt{r/Indieheads} is the way to reach more people if you want to. Just like if you wanted to do even more, you’d probably do it on \texttt{r/music}. \ldots\  Say a small indie band decided to do an AMA they would probably want to do it on \texttt{r/Indieheads}. Because if they did it on \texttt{r/music}, it would get drowned out and nobody would see it because there’s so many posts. In \texttt{r/Indieheads} it would get a decent bit of attention, I think. In the band subreddit, it would probably get a lot of attention too. But \texttt{r/Indieheads} seems like the best middle ground for that kind of thing.
+}
+
+\noindent I2 explained that when the psych-rock band \textit{King Gizzard \& the Lizard Wizard} wanted to engage with an audience on Reddit, they had a choice whether to post in the smaller ``band subreddit'' dedicated to them, the very large \texttt{r/music}, or the medium-sized \texttt{r/Indieheads}. Although posting in the band subreddit would have surely provided an audience, they chose \texttt{r/Indieheads}, which was large but where there was still little risk that their post would be drowned out.  
+
+Our interviewees repeatedly described how finding an audience for one's content is a clear motivation for posting in larger subreddits.
+However, we also heard that competition for attention in the largest subreddits leads people to try to find an audience in smaller subreddits. 
+In the smallest subreddits, posting may not seem worthwhile at all.
+This trade-off between finding a large audience and being ignored suggests that posting in subreddits of intermediate size can be the most reliable way to reach a sizable audience. 
+
+
+
+
+
+
+
+
+
+%
+ \subsection{Tensions Between the Benefits}
+\label{sec:tradeoffs}
+
+
+
+The findings in the previous sections imply a clear reason that so many overlapping subreddits exist. When one subreddit prohibits a certain type of content or conversation, an adjacent group can form that allows it. When an identity group is marginalized in one subreddit, members of that group may form a subreddit of their own. When getting attention in a large subreddit is too difficult, a smaller subreddit becomes attractive.
+Using data from our interviewees, we describe each of the three possible tensions that exist between the three benefits: (1) subreddits where one finds a large audience are less able to provide specific types of content; (2) communities with large audiences are rarely able to provide a community of similar others; (3) some valuable types of discussion and information are found only in diverse groups of people.  As we discuss in §\ref{sec:discussion.trillemma}, taken together, these tensions form a ``trilemma''---i.e., a choice with three mutually incompatible options---between our interviewees' desires for specific content, homophily, and finding audiences. A single community might provide two of these benefits, but almost never all three. 
+
+
+
+
+
+
+\subsubsection{Larger audiences create background noise}
+
+In §\ref{sec:content}, we described how subreddits are structured according to distinctions between different types of content. Breaking topical areas into subreddits of varying levels of granularity makes finding specific content easier because doing so reduces the need to sift through unrelated material in a large and broad subreddit. Our interviewees often expressed that larger subreddits are simply not the best places for enthusiasts to have discussions:
+
+
+
+\blockquote[C2]{I see this background noise problem building [in] \texttt{r/climbing}, the main climbing community, [which] has just become less and less and less interesting and less relevant as it’s gotten bigger. That’s not really a problem. Right? That’s probably has more to do with my interest level and how long I’ve been on it. And my experience level with climbing. I'm just a little bit more crusty about it, you know?
+}
+
+\noindent C2 describes losing interest in the primary subreddit about climbing as it grew because of the interviewee's specific interest in particular types of climbing content (i.e., material associated with being ``crusty'' or experienced). C2 recognizes that when \texttt{r/climbing} experienced growth, the larger volume of posts by newcomers to the sport created a ``background noise problem'' that made it difficult for established climbers to find discussions of interest. 
+
+
+
+
+Similarly, smaller subreddits can be incredibly valuable to those looking for highly specialized information.  Even though they may have very low levels of activity, they can provide a way to learn about rare forms of expertise. A participant in our Vintage Audio cluster explained how they might seek out advice on building a reel-to-reel audio setup: 
+
+\blockquote[V2]{
+If you're at [\texttt{r/ReelToReel}]. Everybody is hyper into them. Whereas there's probably overlap with somebody in  \texttt{r/vintageaudio} \ldots\ If I'm like trying to rebuild my reel-to-reel player, I want to talk to \ldots\ the most knowledgeable person particularly about building reel-to-reel \ldots\
+So I know that who I'm talking to is hyper specific to the knowledge I want. 
+}
+
+\noindent Invoking \texttt{r/ReelToReel}, V2 describes a highly niche subreddit about archaic audio tape equipment with only 3,200 subscribers and a handful of posts each day. V2 is simply not looking to find a large audience. Instead, they want access to the ``most knowledgeable person'' with specific expertise because access to this expertise makes it possible for them to consider doing their own reel-to-reel projects. 
+
+Although the \texttt{r/ReelToReel} community overlaps with the larger and more general \texttt{r/vintageaudio}, the latter does not provide the ability to connect with a small group of expert enthusiasts in an old-fashioned technology. 
+
+Similarly, when someone wants a podcast recommendation tailored to their personal tastes, asking in a larger subreddit is not likely to prove as fruitful as it is within a smaller one. O2, a participant in the Podcasting cluster explained:
+
+\blockquote[O2]{So I think for like \texttt{r/audiodrama}, I would probably write a longer post, and probably get a bit more into like, my personal tastes. Like I would comment about, `oh, I really love the acting in this one, is there anything similar?'
+Open up a bit more about what I do and don’t like. Whereas I think in podcasts, it probably would be more  direct. I’d ask a specific question \ldots\ more to the point, more factual, probably just more almost transactional.}
+
+\noindent Although the larger \texttt{r/podcasts} subreddit is a popular place to promote podcasts on Reddit, O2 explains that they prefer asking for recommendations in the smaller \texttt{r/audiodrama} where they find others willing to take their personal tastes into account. Our interviewees did not advance a ``smaller is better'' argument. O2 explains that they still engage in larger subreddits but use a more direct and transactional approach to information exchange when they do. Similarly, large art communities provide opportunities to find a large audience, but someone can find more substantive feedback to improve their skills by posting in a smaller subreddit organized specifically for this purpose.
+
+
+
+Interviewees described the most general interest-based subreddits such as \texttt{r/podcasts}, \texttt{r/painting}, and \texttt{r/climbing} as more accessible and welcoming to newcomers and as reaching a larger audience: all things they valued. They also described these larger groups as having a high volume of low-effort posts or comments. 
+Our interviewees explained that although they play a useful role in an information ecosystem, the largest subreddits in a topical area are rarely the best places to look for information or advice.
+They explained that small subreddits can effectively play host to content, information, and discussion that larger subreddits cannot.  
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+\subsubsection{Homophily is more difficult in larger groups}
+
+Because they have less background noise, smaller subreddits are more likely to provide better opportunities to connect with people who share one's distinctive interests, tastes, and identity.  Smaller subreddits are also better places to find a community because they provide opportunities to have repeated encounters with recognizable others, off-topic discussions, and personal interactions. P4 explained:
+
+
+\blockquote[P4]{Obviously, I want as many people to see my stuff as possible, especially [since I am] trying to establish myself. But at the same time, I do want to build a relationship with any sort of community that I can.}
+
+
+
+
+
+\noindent P4 explained that they participated in multiple communities because they have two goals as an artist. First, they want to find an audience for their artwork to establish their career. Second, they want to build a community with others who share their craft. They felt that they needed to turn to multiple subreddits to satisfy both needs.
+
+Although larger subreddits provide a large potential audience, smaller subreddits were described as being friendlier. Another interviewee from our Painting cluster explains that this is because of how people act differently in large and small subreddits:
+
+\blockquote[P3]{
+I live in the middle of nowhere. And every so often, before the pandemic, I would visit the [large city several hours away]. Now I found there were very polite people, both in [the city] and in [my rural area]. But the tone by which people carried themselves changes in their environment: that's kind of one of the big changing factors. So, in the city, people are in a rush, they're about their business. We don't really have time to chat. 
+\ldots\ 
+The big subreddits might seem unfriendly [but] it’s not that so much. Individual members are impolite or unfriendly. But it’s almost as though people carry themselves differently when we’re in different subreddits.}
+
+\noindent In their extended metaphor, P3 explained that large subreddits are like big cities full of busy people who do not ``have time to chat.''  Evocatively, they described people as behaving differently in large and small subreddits. The very same people who are rude in large subreddits might be friendly in smaller subreddits where people have repeated encounters with one another and have a stronger sense of knowing each other.
+In another quote from the same cluster, P2 described how the small subreddit for Bob Ross-inspired painters, \texttt{r/HappyTrees}, stands out from the larger art subreddits because people know one another and it does not feel anonymous. The tight-knit nature of this community contributes to its utility as a source for feedback.
+
+ 
+
+
+
+
+
+
+
+\subsubsection{Tension between finding specific content and homophily}
+
+A third tension described by our interviewees is that between the desire to connect with similar others and the desire for forms of discussion, content, and feedback that can only be found in diverse groups inclusive of dissimilar others.
+Our interviewees described a range of situations when they sought out dissimilar others. For example, they described beginners seeking to learn from experts and outsiders seeking to learn about other cultures. They also described how subreddits instituted rules to limit or organize content that also interfered with unstructured and off-topic discussions that helped with community building.
+
+For example, although multiple subreddits with overlapping users discuss the same episodes of the TV series \textit{Rupaul's Drag Race}, they have different understandings of events in the show depending on their national identities. D1 explained:
+
+\blockquote[D1]{
+The discussions played out differently on different subreddits. In the Drag Race UK sub there’s a lot more understanding about [a British drag queen] in particular, about where they come from \ldots\ In America we don’t understand how that person is from Worcestershire. }
+
+\noindent D1 explained that the cultural background of one of the drag queens was a subject of discussion in \texttt{r/RPDR\_UK}, the UK drag race subreddit, while the main subreddit, \texttt{r/rupaulsdragrace}, was ``dominated by the American viewpoint.'' 
+
+Our interviewees described a number of subreddits focused on discussing broad topics from a specific national or regional cultural context. These cultural communities within a topical area provide a homophilous space for sharing distinctive cultural knowledge and sensibilities.
+The wrinkle is that even for our American interviewee D1, the \texttt{r/RPDR\_UK} subreddit provided an opportunity to enhance their own experience and appreciation of the show by observing and learning from members of another culture.
+In examples such as these, our interviewees explained that communities where like-minded people can share their distinctive appreciation of a show could provide a source of knowledge for outsiders.
+
+Similarly, Painting participant P2 explained that a group that has a mixture of experts and beginners provides a better learning environment than does a group of beginners alone:
+
+\blockquote[P2]{If you can find a small group, with a small core of people who are particularly skilled, they sort of energize the group as a whole. \texttt{r/HappyTrees}, even though it's kind of a beginner subreddit, there's some people that posts there that are like, you know, Bob Ross instructors, or they've been doing this for years. And they've mastered that sort of \ldots\ ``happy trees'' thing.
+}
+
+\noindent P2 explains that part of what makes \texttt{r/HappyTrees} great is that it connects learners to experts. A homogenous subreddit of only beginners or experts would not provide the same opportunities.
+
+
+
+To stay focused on specific types of content, subreddit moderators will frequently employ strict rules and heavy-handed moderation. 
+Our respondents explained that smaller subreddits can get by with fewer rules and lighter moderation because they have fewer behavior problems and are less attractive to toxic outsiders. They are also more able to self-police using Reddit's voting system and through direct interpersonal sanctions such as admonition. In the words of one of the Vintage Audio participants,
+
+\blockquote[V2]{In Reddit, the more users you get, the more strict the rules, and the more strict the moderation. Just to prevent problems.
+}
+
+\noindent V2 continued and explained that when a subreddit is small enough that you can ``wrap your hands around'' it and is built around a ``like-minded'' group, it can develop and enforce shared behavioral norms that substitute for formal rules and rigid enforcement regimes. V2 explained that the processes of creating spaces for specific types of information got in the way of building community. 
+
+Similarly, one of our interviewees described \texttt{r/Indieheads}'s rules limiting how often one can post, requiring specific titles and tags, and prohibiting types of user-generated content. Although these rules help maintain a high-quality feed, they also prevent sharing of more personal and relatable forms of content such as amateur performances and chit-chat. As a result, subreddits that make rules to ensure that posts are on-topic frequently have adjacent ``-jerk'' subreddits that provide an outlet for jokes and memes and act as places where off-topic discussions can thrive.
+
+
+
+
+
+
+
+
+
+\subsection{Interviewees' Understandings of Competition and Mutualism}
+\label{sec:results.competition}
+
+
+
+
+Except for a small qualitative subpart of a single paper \citep{zhu_selecting_2014}, prior ecological studies in social computing have relied on concepts such as competition and mutualism but have provided limited evidence that such concepts are salient to participants. 
+As part of our interviews, we asked our interviewees if they perceived relationships between the communities they participated in to be competitive or mutualistic.  In some cases, interviewees imagined hypothetical scenarios where competition might emerge from the perspective of subreddit moderators. For example, a participant in Climbing said:
+
+\blockquote[C1]{I guess if you put your Reddit [moderator status] on your resume or something, and you want to be a moderator of a larger community, you could try to get users from other communities. But I haven’t seen or experienced competition.}
+
+\noindent Although we asked nearly every interviewee about competition, only one interviewee (S2) described an actual instance of conflict or direct competition.
+In nearly every other interview, our subjects found our suggestion that subreddits might be in competition to be surprising and strange.
+
+However, the idea that communities are complementary and mutualistic was much more intuitive. One Vintage Audio participant explained the relationship between subreddits:
+
+
+
+
+
+\blockquote[V2]{Yeah, the overlapping. \ldots\ They each have their own niche. \ldots\ They get big enough to have super critical mass of people. Then they'll have a reason to exist. And then they'll sort of fit into the ecosystem of different communities.}
+
+\noindent Consonant with this description of subreddits in unproblematic coexistence, our interviewees repeatedly suggested that there were not meaningful structural or technical limitations on the number of subreddits a user can join and this reduced the possibility of competition, if it did not eliminate it altogether. 
+
+ \section{Discussion}
+
+
+\label{sec:discussion.trillemma}
+\begin{figure}
+\centering
+\def\firstcircle{(0,0) circle (1.7cm)}
+\def\secondcircle{(60:2.6cm) circle (1.7cm)}
+\def\thirdcircle{(0:2.6cm) circle (1.7cm)}
+
+\definecolor{myyellow}{HTML}{fae772}
+\definecolor{mygreen}{HTML}{4ac26c}
+\definecolor{mypurple}{HTML}{31668c}
+\begin{tikzpicture}
+    \begin{scope}[shift={(3cm,-5cm)}, fill opacity=1, text width=2cm, text centered]
+
+
+\draw \firstcircle node [xshift=-1ex] {Specific Content};
+        \draw \secondcircle node [yshift=1ex] {Largest Possible Audience};
+        \draw \thirdcircle node [xshift=1ex] {Homophilous \\ Community};
+        \node (A) at (0.6,1.2) {A};
+        \node (A) at (2,1.2) {B};
+        \node (A) at (1.325,0.75) {D};
+        \node (A) at (1.325,0) {C};
+
+\end{scope}
+\end{tikzpicture}
+
+    \caption{Venn diagram illustrating the specificity-homophily-audience ``trilemma.''}
+    \label{fig:trilemma}
+\end{figure}
+
+
+The tensions between the benefits that our interviewees sought can be thought of as forming a ``trilemma'' between finding specific content, homophily, and finding as large an audience as possible. This three-way dilemma captures the fact that the more a subreddit succeeds in providing any one of these benefits, the less able it will be to provide the others. A portfolio of overlapping communities solves this problem by providing all three types of benefits.
+
+
+
+Figure \ref{fig:trilemma} visualizes the theorized trilemma. Each of the benefits described in §\ref{sec:benefits} is reflected in large circles. Each of the tensions described in §\ref{sec:tradeoffs} is reflected in the overlapping areas in the figure.
+Area A contains communities that provide the largest possible audience and specific content but are unlikely to provide homophily to community members. Subreddits that provide large audiences face ``the background noise problem'' as a large volume of submissions makes it difficult for people to find the specific content they care about.
+Area B contains communities that offer both large audiences and homophily but that will struggle to provide specific content. For example, an American interested in learning about international drag culture needs to search beyond \texttt{r/rupaulsdragrace}.
+Area C contains communities that provide specialized content and a homophilous community but that may not attract large audiences.
+Although not everyone who desires a specific type of content may be similar to those who produce the content, smaller subreddits can often provide both desired content and opportunities to socialize with similar others. 
+However, as the size of the audience increases, subreddits encounter the background noise problem and acquire a ``big city'' air of unfriendliness. 
+
+
+
+
+
+\subsection{Connections to Prior Research} 
+
+\subsubsection{Finding specific content}
+
+Our findings are consonant with prior work showing that the primary benefits provided by online communities stem from their power to connect people to novel and hard-to-find sources of information \citep{benkler_wealth_2006, campbell_thousands_2016, von_hippel_free_2016,fiesler_growing_2017}. 
+Our study adds to this work and complements recent findings of \citet{hwang_why_2021} by describing how nested and overlapping online communities are useful for information seeking and managing one's information exposure.  Individuals often desire multiple types of content within a general subject area such as spoiled and spoiler-free discussions.  
+Even when a relatively obscure community such as \texttt{r/vintageaudio} exists, an even more specialized community such as \texttt{r/ReelToReel} may provide access to an even more specialized set of experts. 
+
+
+\subsubsection{Finding homophilous community}
+Prior work has recognized the importance of homophily in motivating and structuring participation in online communities \citep{chang_specialization_2014, cunha_are_2019, grevet_managing_2014}. 
+Contributing to this line of research, we identified a number of types of homophily that drive an individual's decisions to participate. These included hobbies, expertise, age, national culture, identity, and status. Homophily was in tension with the need for specific content in that differences among many of these dimensions were valuable for finding information.
+
+Our results suggest that participants in online communities face trade-offs between homophily and information novelty. These may be similar in structure to the trade-offs between short and long ties observed in contexts such as work groups \citep{ruef_structure_2003} and social networks \citep{grevet_managing_2014, granovetter_strength_1973}. One advantage of joining a group of overlapping online communities is that it can help find information that would be unavailable in homophilous groups. 
+
+\subsubsection{Finding the largest possible audience}
+
+Much social computing research points to the benefits of large audiences and large communities \citep{kraut_building_2012}. Our work adds more evidence to back up those claims. More relevant, perhaps, are recent counterclaims about the benefits of smallness. \citet{hwang_why_2021} present an interview study with members of small Reddit communities. Although our results about the tensions between large audience size and other benefits are fully in line with Hwang and Foote's findings, our starting assumptions and ultimate takeaways are quite different. \citeauthor{hwang_why_2021} seek to understand why people participate in persistently small communities and conclude that smallness offers a range of benefits. Our results suggest that individuals seek out benefits that happen to be incompatible with largeness and participate in portfolios of communities that, because of the trilemma we described, will almost certainly include small ones. Although we believe that \citepos{hwang_why_2021} emphasis on smallness might draw focus to a side effect instead of the cause, we believe that the findings in our two papers are largely complementary.
+
+Although users may desire large audiences, large online communities often require additional structure to maintain order \citep{kiene_surviving_2016, gillespie_custodians_2018, kiene_technological_2019}. \citet{kiene_surviving_2016} describe how a massive influx of newcomers presents difficulties that can be managed by appointing additional moderators, increasing norm enforcement, and limiting the frequency of posts. \citet{lin_better_2017} find that such interventions help subreddits maintain comment quality and stay on topic during massive influxes of growth. Our sense is that these changes ensure the availability of specific content, in part, because of the growth-limiting effects of rules and enforcement \citep{halfaker_rise_2013, teblunthuis_revisiting_2018}. We see this as yet more evidence in favor of our theory. 
+
+
+
+
+\subsection{Implications for Ecological Studies in Social Computing}
+
+
+The quote by V2 in §\ref{sec:results.competition} can be read as a kind of summary of resource partitioning theory (RPT), a strand of ecological research in organizational science that focuses on explaining specialization \citep{carroll_concentration_1985}. Although RPT has not been deeply examined in prior social computing work, our findings suggest that it may be able to explain the widespread occurrence of overlapping communities. RPT proposes that the reason that small specialized organizations coexist with large generalist organizations is that generalists are constrained in their ability to meet distinctive needs in niche markets \citep{carroll_why_2000, swaminathan_resource_2001}. In V2's terms, the ``ecosystem of different communities'' is constructed by a process in which those that ``have a reason to exist'' and are specialized to ``have their own niche'' will achieve ``critical mass.''
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Our grounded theory suggests that the trade-offs in the capacity of an online community to provide different types of benefits that people seek from online communities give rise to new niches.
+On the basis of our findings and our understanding of RPT, we hypothesize the following process to describe how systems of overlapping communities develop:
+
+
+
+
+
+
+
+
+When a new topical area grows, the bulk of activity will happen in a generalist community. New members joining that community may seek and find the perceived benefits described in §\ref{sec:benefits} (i.e., specific kinds of content, homophily, and the largest possible audience). 
+If a topic area, such as art, is sufficiently general, initial membership growth occurs as the community attracts new and existing users interested in both general and more specific types of content. 
+
+As growth continues, membership in the generalist community becomes heterogeneous with lower levels of homophily (e.g., amateur and professional artists) and more specific interests (e.g., painters and photographers) and types of engagement desired (e.g., attention from an audience or critique). At this point, the trade-offs we discuss in §\ref{sec:tradeoffs} related to size become relevant. Finding information related to a specialized subtopic and homophilous socializing grows difficult. 
+
+
+
+If, as with Reddit, creating new communities is low cost, a community specialized in a subtopic can emerge.
+This specialized community will likely not attract as large an audience as the generalist community. However, those most interested in the specific subtopic will join it to escape what our interviewees describe as ``background noise'' in the larger generalist community.
+Similarly, those seeking personal interaction or social bonding with other community members will be more likely to find them in the specialized community.
+A similar process occurs in the formation of spaces having different rules or purposes (such as ``jerk'' spaces). 
+The cycle will then begin anew as subreddits repartition a subtopic such as \texttt{r/painting} into subspecialists such as \texttt{r/oilpainting} and \texttt{r/watercolor}.
+Although some of our interviewees described parts of this process, the model we have narrated is an untested theory.
+We leave it to future work to establish its empirical validity. 
+
+
+
+
+
+
+
+
+
+\subsection{Implications for Design}
+By allowing users to create multiple communities with similar or identical topics, platforms can host ecosystems of online communities capable of providing a larger range of benefits to a larger range of users.
+Some platforms, such as Stack Exchange, prohibit new communities from overlapping with existing communities \citep{fu_knowledge_2016}. 
+Our findings suggest that such rules limit the range of the benefits the platform's communities can confer.
+
+Existing designs for online community platforms such as Reddit are at best ``first-order approximations'' of an ideal solution in that a ``sociotechnical gap'' remains between these designs and the goal of a platform that meets every person's every need \citep{ackerman_intellectual_2000}. Our interviewees partly filled this gap with personalized bespoke solutions in the form of their handpicked portfolios of communities. 
+Improved designs for multi-community discovery and engagement can better support users in knitting together portfolios of communities.
+
+Many Reddit users make heavy use of the aggregated streaming feeds \texttt{r/all} and \texttt{r/popular}, which surface highly upvoted posts from across Reddit.  
+Our interviewees described these feeds as most often featuring content from subreddits that are already extremely popular. Furthermore, Reddit's system for recommending subreddits often returned irrelevant suggestions.
+Suggesting communities in as many cells in Figure \ref{fig:trilemma} as possible could help users build their portfolios of communities.
+Because increased visibility may create stress and labor for communities and moderators \citep{kiene_surviving_2016}, recommendations should target those potential members likely to be positive contributors.
+
+Although some of our interviewees used the ``multireddit'' feature for making a custom feed of subreddits, they described this feature as cumbersome and overwhelming.
+A design alternative is to formalize or even automate the types of informal social practices our interviewees described such as cross-community linking and cross-posting. 
+For example, a subreddit such as \texttt{r/vintageaudio} might configure an auto-moderator to detect posts about reel-to-reel equipment and recommend cross-posting to \texttt{r/reeltoreel}. 
+A discussion-focused subreddit might routinely invite productive contributors to discussions in the related ``main'' subreddit.
+Because intercommunity interactions can give rise to conflict, individual communities should have control of how such practices are implemented.
+New tools for collaboration between moderation teams may enable the institution of policies encouraging productive concurrent participation in overlapping communities.
+
+
+
+\subsection{Limitations}
+
+Our study has limitations common to all interview-based studies. Our findings derive from in-depth conversations with relatively few of the people who were highly active participants in the handful of clusters of communities in our sample. 
+Although our study was designed to achieve analytic saturation within each cluster and to cover a wide range of types of topics discussed on Reddit, additional interviews across a wider range of communities might uncover new types of specialization. Additionally, our interviewees were among the most active members of the clusters, and their experiences may differ from those of peripheral members.
+Similarly, we cannot speak to the experiences of those who participated in only one community within a cluster. 
+
+
+
+Our interview data were collected at one point in time and cannot speak to how the dynamics we describe played out over time or how new communities were created and emerged.
+Relatedly, although we find that overlapping communities tend to provide different benefits to members, we did not set out to interview community founders and thus cannot speak to the reasons that communities were created \citep{foote_starting_2017}. 
+
+Furthermore, our study focuses only on the Reddit platform.   Reddit has distinctive affordances for voting, moderation, and multicommunity engagement that might shape the construction and use of overlapping communities. Although Reddit is among the most popular online community platforms, our findings may not describe relationships between overlapping communities on other platforms, or between one platform and another. Different platforms likely have different strengths or weaknesses for building communities that provide some types of benefits but not others. 
+At the same time, cross-platform engagement may involve frictions related to the use of multiple identities and sociotechnical systems.
+Future research should investigate how people use portfolios that include communities on multiple platforms.
+ 
+\section{Conclusion}
+Why are the same people talking to each other about similar things in different online communities? We answer this question by developing a theory grounded in the analysis of 20 interviews with members of highly related communities on Reddit. Our answer suggests that people turn to online communities in search of multiple benefits---specific kinds of content and discussion, socialization in a homophilous community, and attention from the largest possible audience. We argue that although structures such as the topic, rules, and size of a community might improve the degree to which it provides one of these benefits, they will necessarily detract from its ability to provide others.  Multiple communities having a range of structures exist to provide the full range of benefits. No community can do everything.
+
--- a/dissertations/nathante_uw_2021/equalogy_refs.bib
+++ b/dissertations/nathante_uw_2021/equalogy_refs.bib
--- a/dissertations/nathante_uw_2021/figures/GN_session_device_plot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/GN_session_device_plot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/calibration-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/calibration-1.pdf
--- a/dissertations/nathante_uw_2021/figures/cod_graphviz.pdf
+++ b/dissertations/nathante_uw_2021/figures/cod_graphviz.pdf
--- a/dissertations/nathante_uw_2021/figures/fig_spacing-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/fig_spacing-1.pdf
--- a/dissertations/nathante_uw_2021/figures/hazardplot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/hazardplot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/histograms_1-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/histograms_1-1.pdf
--- a/dissertations/nathante_uw_2021/figures/kernelplots-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/kernelplots-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-adoption_me_plot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-adoption_me_plot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-fig_densityxgrowth-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-fig_densityxgrowth-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-h1_unreg_me_plot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-h1_unreg_me_plot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-h1_userpage_me_plot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-h1_userpage_me_plot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-me_plot_H2_anon-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-me_plot_H2_anon-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-me_plot_H2_no_user_page-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-me_plot_H2_no_user_page-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-plot_commense_x_abs_commense-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-plot_commense_x_abs_commense-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-regplot_H1_anon-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-regplot_H1_anon-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-regplot_H3_anon-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-regplot_H3_anon-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-regplot_controversial_anon-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-regplot_controversial_anon-1.pdf
--- a/dissertations/nathante_uw_2021/figures/knitr-regplot_controversial_no_user_page-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/knitr-regplot_controversial_no_user_page-1.pdf
--- a/dissertations/nathante_uw_2021/figures/mental_graphviz.pdf
+++ b/dissertations/nathante_uw_2021/figures/mental_graphviz.pdf
--- a/dissertations/nathante_uw_2021/figures/model1aplot-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/model1aplot-1.pdf
--- a/dissertations/nathante_uw_2021/figures/rcfilters_example_2.png
+++ b/dissertations/nathante_uw_2021/figures/rcfilters_example_2.png
--- a/dissertations/nathante_uw_2021/figures/realestate_graphviz.pdf
+++ b/dissertations/nathante_uw_2021/figures/realestate_graphviz.pdf
--- a/dissertations/nathante_uw_2021/figures/score_correlation-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/score_correlation-1.pdf
--- a/dissertations/nathante_uw_2021/figures/uncertainty-1.pdf
+++ b/dissertations/nathante_uw_2021/figures/uncertainty-1.pdf
--- a/dissertations/nathante_uw_2021/figures/watches_graphviz.pdf
+++ b/dissertations/nathante_uw_2021/figures/watches_graphviz.pdf
--- a/dissertations/nathante_uw_2021/frontmatter.pdf
+++ b/dissertations/nathante_uw_2021/frontmatter.pdf
--- a/dissertations/nathante_uw_2021/ores_fairness.bib
+++ b/dissertations/nathante_uw_2021/ores_fairness.bib
--- a/dissertations/nathante_uw_2021/references.bib
+++ b/dissertations/nathante_uw_2021/references.bib
--- a/dissertations/nathante_uw_2021/refs.bib
+++ b/dissertations/nathante_uw_2021/refs.bib
--- a/dissertations/nathante_uw_2021/resources/network-figures.tex
+++ b/dissertations/nathante_uw_2021/resources/network-figures.tex
@ -0,0 +1 @@
+/home/nathante/partitioning_reddit/diss_paper/resources/network-figures.tex
--- a/dissertations/nathante_uw_2021/title_page.pdf
+++ b/dissertations/nathante_uw_2021/title_page.pdf
--- a/irb_uw/euds_interviews-20210618/[EUDS
+++ b/irb_uw/euds_interviews-20210618/[EUDS
--- a/irb_uw/euds_interviews-20210618/[EUDS
+++ b/irb_uw/euds_interviews-20210618/[EUDS
--- a/irb_uw/euds_interviews-20210618/[EUDS
+++ b/irb_uw/euds_interviews-20210618/[EUDS
--- a/irb_uw/euds_interviews-20210618/[EUDS
+++ b/irb_uw/euds_interviews-20210618/[EUDS
--- a/irb_uw/euds_interviews-20210618/[EUDS
+++ b/irb_uw/euds_interviews-20210618/[EUDS
--- a/irb_uw/india_wikipedia_interviews-20191107/[India
+++ b/irb_uw/india_wikipedia_interviews-20191107/[India
--- a/irb_uw/india_wikipedia_interviews-20191107/[India
+++ b/irb_uw/india_wikipedia_interviews-20191107/[India
--- a/irb_uw/india_wikipedia_interviews-20191107/[India
+++ b/irb_uw/india_wikipedia_interviews-20191107/[India
--- a/irb_uw/india_wikipedia_interviews-20191107/[India
+++ b/irb_uw/india_wikipedia_interviews-20191107/[India
--- a/irb_uw/india_wikipedia_interviews-20191107/[India
+++ b/irb_uw/india_wikipedia_interviews-20191107/[India
				`@ -0,0 +1 @@`
				`/home/nathante/partitioning_reddit/diss_paper/resources/network-figures.tex`