diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/README.txt b/cscw_changelogs/2017-the_wikipedia_adventure/README.txt new file mode 100644 index 0000000..0edf3c9 --- /dev/null +++ b/cscw_changelogs/2017-the_wikipedia_adventure/README.txt @@ -0,0 +1,7 @@ +Material for paper: + +Narayan, Sneha, Jake Orlowitz, Jonathan Morgan, Benjamin Mako Hill, and Aaron +Shaw. 2017. “The Wikipedia Adventure: Field Evaluation of an Interactive +Tutorial for New Users.” In Proceedings of the 20th ACM Conference on +Computer-Supported Cooperative Work & Social Computing (CSCW ’17). New York, +New York: ACM. https://doi.org/10.1145/2998181.2998307 diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/refs-processed.bib b/cscw_changelogs/2017-the_wikipedia_adventure/refs-processed.bib new file mode 100644 index 0000000..f2da457 --- /dev/null +++ b/cscw_changelogs/2017-the_wikipedia_adventure/refs-processed.bib @@ -0,0 +1,1088 @@ + +@inproceedings{huang_how_2015, + address = {New York, NY, USA}, + series = {{CHI} '15}, + title = {How {Activists} {Are} {Both} {Born} and {Made}: {An} {Analysis} of {Users} on {Change}.{Org}}, + isbn = {978-1-4503-3145-6}, + shorttitle = {How {Activists} {Are} {Both} {Born} and {Made}}, + url = {http://doi.acm.org/10.1145/2702123.2702559}, + doi = {10.1145/2702123.2702559}, + urldate = {2016-01-20}, + booktitle = {Proceedings of the 33rd {Annual} {ACM} {Conference} on {Human} {Factors} in {Computing} {Systems}}, + publisher = {ACM}, + author = {Huang, Shih-Wen and Suh, Minhyang (Mia) and Hill, Benjamin Mako and Hsieh, Gary}, + year = {2015}, + keywords = {civic engagement, contribution, e-petition, motivation, online activism, power user}, + pages = {211--220} +} + +@inproceedings{kittur_herding_2009, + address = {New York, NY, USA}, + series = {{WikiSym} '09}, + title = {Herding the {Cats}: {The} {Influence} of {Groups} in {Coordinating} {Peer} {Production}}, + isbn = {978-1-60558-730-1}, + shorttitle = {Herding the {Cats}}, + url = {http://doi.acm.org/10.1145/1641309.1641321}, + doi = {10.1145/1641309.1641321}, + urldate = {2016-08-08}, + booktitle = {Proceedings of the 5th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Kittur, Aniket and Pendleton, Bryan and Kraut, Robert E.}, + year = {2009}, + keywords = {coordination, groups, organizational citizenship behavior, peer production, self-identification, wikipedia}, + pages = {7:1--7:9} +} + +@article{dejean_big_2015, + title = {Big from the beginning: {Assessing} online contributors' behavior by their first contribution}, + volume = {44}, + issn = {0048-7333}, + shorttitle = {Big from the beginning}, + url = {http://www.sciencedirect.com/science/article/pii/S0048733315000360}, + doi = {10.1016/j.respol.2015.03.001}, + number = {6}, + urldate = {2016-08-09}, + journal = {Research Policy}, + author = {Dejean, Sylvain and Jullien, Nicolas}, + month = jul, + year = {2015}, + keywords = {Econometric studies, Heckman, Open online communities, survey, Voluntary participation, wikipedia}, + pages = {1226--1239} +} + +@inproceedings{arazy_functional_2015, + address = {New York, NY, USA}, + series = {{CSCW} '15}, + title = {Functional {Roles} and {Career} {Paths} in {Wikipedia}}, + isbn = {978-1-4503-2922-4}, + url = {http://doi.acm.org/10.1145/2675133.2675257}, + doi = {10.1145/2675133.2675257}, + urldate = {2015-11-11}, + booktitle = {Proceedings of the 18th {ACM} {Conference} on {Computer} {Supported} {Cooperative} {Work} \& {Social} {Computing}}, + publisher = {ACM}, + 
author = {Arazy, Ofer and Ortega, Felipe and Nov, Oded and Yeo, Lisa and Balila, Adam}, + year = {2015}, + keywords = {functional roles, ORGANIZATIONAL structure, peer-production, role transitions, wikipedia}, + pages = {1092--1105} +} + +@inproceedings{lampe_motivations_2010, + address = {Atlanta, Georgia, USA}, + title = {Motivations to participate in online communities}, + isbn = {978-1-60558-929-9}, + url = {http://portal.acm.org/citation.cfm?id=1753326.1753616}, + doi = {10.1145/1753326.1753616}, + urldate = {2010-10-26}, + booktitle = {Proceedings of the 28th international conference on {Human} factors in computing systems}, + publisher = {ACM}, + author = {Lampe, Cliff and Wash, Rick and Velasquez, Alcides and Ozkaya, Elif}, + year = {2010}, + keywords = {lurkers, motivation, Online Communities, peripheral participation}, + pages = {1927--1936} +} + +@book{kraut_building_2012, + address = {Cambridge, MA}, + title = {Building {Successful} {Online} {Communities}: {Evidence}-{Based} {Social} {Design}}, + isbn = {0-262-01657-5}, + shorttitle = {Building {Successful} {Online} {Communities}}, + publisher = {The MIT Press}, + author = {Kraut, Robert E. and Resnick, Paul}, + collaborator = {Kiesler, Sara and Burke, Moira and Chen, Yan and Kittur, Niki and Konstan, Joseph and Ren, Yuqing and Riedl, John}, + month = mar, + year = {2012}, + keywords = {Computer networks, Internet, Online social networks, Planning, SOCIAL aspects, Social aspects Planning, Social psychology} +} + +@incollection{rafaeli_online_2008, + title = {Online {Motivational} {Factors}: {Incentives} for {Participation} and {Contribution} in {Wikipedia}}, + isbn = {978-1-139-47017-9}, + language = {en}, + booktitle = {Psychological {Aspects} of {Cyberspace}: {Theory}, {Research}, {Applications}}, + publisher = {Cambridge University Press}, + author = {Rafaeli, Sheizaf and Ariel, Yaron}, + editor = {Barak, Azy}, + month = may, + year = {2008}, + keywords = {Computers / Internet / General, Language Arts \& Disciplines / Communication Studies, Psychology / General, Psychology / Social Psychology} +} + +@inproceedings{halfaker_making_2013, + address = {New York, NY, USA}, + series = {{CSCW} '13}, + title = {Making peripheral participation legitimate: reader engagement experiments in wikipedia}, + isbn = {978-1-4503-1331-5}, + shorttitle = {Making peripheral participation legitimate}, + url = {http://doi.acm.org/10.1145/2441776.2441872}, + doi = {10.1145/2441776.2441872}, + urldate = {2013-07-06}, + booktitle = {Proceedings of the 2013 conference on {Computer} supported cooperative work}, + publisher = {ACM}, + author = {Halfaker, Aaron and Keyes, Oliver and Taraborelli, Dario}, + year = {2013}, + keywords = {Experiment, legitimate peripheral participation, open production, participation, quantitative, social learning, wikipedia}, + pages = {849--860} +} + +@inproceedings{antin_technology-mediated_2012, + address = {New York, NY, USA}, + series = {{CSCW} '12}, + title = {Technology-mediated contributions: editing behaviors among new wikipedians}, + isbn = {978-1-4503-1086-4}, + shorttitle = {Technology-mediated contributions}, + url = {http://doi.acm.org/10.1145/2145204.2145264}, + doi = {10.1145/2145204.2145264}, + urldate = {2013-07-06}, + booktitle = {Proceedings of the {ACM} 2012 conference on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Antin, Judd and Cheshire, Coye and Nov, Oded}, + year = {2012}, + keywords = {legitimate peripheral participation, wikipedia, wiki-work}, +
pages = {373--382} +} + +@article{halfaker_rise_2013, + title = {The {Rise} and {Decline} of an {Open} {Collaboration} {System}: {How} {Wikipedia}'s {Reaction} to {Popularity} {Is} {Causing} {Its} {Decline}}, + volume = {57}, + issn = {0002-7642, 1552-3381}, + url = {http://abs.sagepub.com/content/57/5/664}, + doi = {10.1177/0002764212469365}, + language = {en}, + number = {5}, + urldate = {2013-07-07}, + journal = {American Behavioral Scientist}, + author = {Halfaker, Aaron and Geiger, R. Stuart and Morgan, Jonathan T. and Riedl, John}, + month = may, + year = {2013}, + keywords = {governance, peer production, quality control, retention, wikipedia}, + pages = {664--688} +} + +@inproceedings{kriplean_articulations_2008, + address = {San Diego, CA, USA}, + title = {Articulations of wikiwork: uncovering valued work in wikipedia through barnstars}, + isbn = {978-1-60558-007-4}, + shorttitle = {Articulations of wikiwork}, + url = {http://portal.acm.org/citation.cfm?id=1460563.1460573}, + doi = {10.1145/1460563.1460573}, + urldate = {2010-06-24}, + booktitle = {Proceedings of the 2008 {ACM} conference on {Computer} supported cooperative work ({CSCW} 2008)}, + publisher = {ACM}, + author = {Kriplean, Travis and Beschastnikh, Ivan and McDonald, David W.}, + year = {2008}, + keywords = {articulation work, barnstars, commons-based peer production, online community, wikipedia}, + pages = {47--56} +} + +@book{reagle_good_2010, + address = {Cambridge, MA}, + title = {Good {Faith} {Collaboration}: {The} {Culture} of {Wikipedia}}, + isbn = {978-0-262-01447-2}, + shorttitle = {Good faith collaboration}, + publisher = {MIT Press}, + author = {Reagle, Joseph}, + year = {2010} +} + +@inproceedings{welser_finding_2011, + address = {New York, NY, USA}, + series = {{iConference} '11}, + title = {Finding social roles in {Wikipedia}}, + isbn = {978-1-4503-0121-3}, + url = {http://doi.acm.org/10.1145/1940761.1940778}, + doi = {10.1145/1940761.1940778}, + urldate = {2012-02-24}, + booktitle = {Proceedings of the 2011 {iConference}}, + publisher = {ACM}, + author = {Welser, Howard T. and Cosley, Dan and Kossinets, Gueorgi and Lin, Austin and Dokshin, Fedor and Gay, Geri and Smith, Marc}, + year = {2011}, + keywords = {online community, Social networks, social roles, structural signatures, wikipedia}, + pages = {122--129} +} + +@book{deci_intrinsic_1985, + series = {Perspectives in {Social} {Psychology}}, + title = {Intrinsic {Motivation} and {Self}-{Determination} in {Human} {Behavior}}, + isbn = {0-306-42022-8}, + publisher = {Plenum Press}, + author = {Deci, Edward L. and Ryan, Richard M.}, + month = aug, + year = {1985} +} + +@article{frey_motivation_2001, + title = {Motivation {Crowding} {Theory}}, + volume = {15}, + url = {http://dx.doi.org/10.1111/1467-6419.00150}, + doi = {10.1111/1467-6419.00150}, + number = {5}, + urldate = {2008-08-26}, + journal = {Journal of Economic Surveys}, + author = {Frey, Bruno S.
and Jegen, Reto}, + year = {2001}, + keywords = {Economics, Volunteers}, + pages = {589--611} +} + +@article{zhang_group_2010, + title = {Group size and incentives to contribute: {A} natural experiment at {Chinese} {Wikipedia}}, + volume = {101}, + journal = {American Economic Review}, + author = {Zhang, Xiaoquan (Michael) and Zhu, Feng}, + year = {2010}, + pages = {1601--1615} +} + +@inproceedings{panciera_wikipedians_2009, + address = {New York, NY, USA}, + series = {{GROUP} '09}, + title = {Wikipedians are born, not made: a study of power editors on {Wikipedia}}, + isbn = {978-1-60558-500-0}, + shorttitle = {Wikipedians are born, not made}, + url = {http://doi.acm.org/10.1145/1531674.1531682}, + doi = {10.1145/1531674.1531682}, + urldate = {2012-06-14}, + booktitle = {Proceedings of the {ACM} 2009 international conference on {Supporting} group work}, + publisher = {ACM}, + author = {Panciera, Katherine and Halfaker, Aaron and Terveen, Loren}, + year = {2009}, + keywords = {collaboration, contribution, power editors, Wiki, wikipedia}, + pages = {51--60} +} + +@article{benkler_coases_2002, + title = {Coase's penguin, or, {Linux} and the nature of the firm}, + volume = {112}, + url = {http://yalelawjournal.org/112/3/369_yochai_benkler.html}, + number = {3}, + urldate = {2008-09-14}, + journal = {Yale Law Journal}, + author = {Benkler, Yochai}, + year = {2002}, + keywords = {Advantages, Economics, FOSS, internet, Law, Legal Studies, Open source software, Production cooperatives, Socioeconomic factors}, + pages = {369--446} +} + +@inproceedings{halfaker_dont_2011, + address = {New York, NY, USA}, + series = {{WikiSym} '11}, + title = {Don't bite the newbies: how reverts affect the quantity and quality of {Wikipedia} work}, + isbn = {978-1-4503-0909-7}, + shorttitle = {Don't bite the newbies}, + url = {http://doi.acm.org/10.1145/2038558.2038585}, + doi = {10.1145/2038558.2038585}, + urldate = {2013-03-06}, + booktitle = {Proceedings of the 7th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Halfaker, Aaron and Kittur, Aniket and Riedl, John}, + year = {2011}, + keywords = {experience, motivation, productivity, quality, revert, wikipedia, WikiWork}, + pages = {163--172} +} + +@inproceedings{zhu_organizing_2012, + address = {New York, NY, USA}, + series = {{CSCW} '12}, + title = {Organizing without formal organization: group identification, goal setting and social modeling in directing online production}, + isbn = {978-1-4503-1086-4}, + shorttitle = {Organizing without formal organization}, + url = {http://doi.acm.org/10.1145/2145204.2145344}, + doi = {10.1145/2145204.2145344}, + urldate = {2012-05-10}, + booktitle = {Proceedings of the {ACM} 2012 conference on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Zhu, Haiyi and Kraut, Robert and Kittur, Aniket}, + year = {2012}, + keywords = {directing behaviors, governance mechanisms, group goals, group identification, online production communities}, + pages = {935--944} +} + +@inproceedings{morgan_tea_2013, + address = {New York, NY, USA}, + series = {{CSCW} '13}, + title = {Tea and sympathy: crafting positive new user experiences on wikipedia}, + isbn = {978-1-4503-1331-5}, + shorttitle = {Tea and sympathy}, + doi = {10.1145/2441776.2441871}, + urldate = {2013-04-01}, + booktitle = {Proceedings of the 2013 conference on {Computer} supported cooperative work}, + publisher = {ACM}, + author = {Morgan, Jonathan T. 
and Bouterse, Siko and Walls, Heather and Stierch, Sarah}, + year = {2013}, + keywords = {collaboration, Gender, new users, socialization, user experience, wikipedia}, + pages = {839--848} +} + +@inproceedings{collier_conflict_2012, + address = {New York, NY, USA}, + series = {{CSCW} '12}, + title = {Conflict, criticism, or confidence: an empirical examination of the gender gap in wikipedia contributions}, + isbn = {978-1-4503-1086-4}, + shorttitle = {Conflict, criticism, or confidence}, + doi = {10.1145/2145204.2145265}, + urldate = {2013-04-01}, + booktitle = {Proceedings of the {ACM} 2012 conference on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Collier, Benjamin and Bear, Julia}, + year = {2012}, + keywords = {confidence, conflict, criticism, Gender, survey, wikipedia}, + pages = {383--392} +} + +@article{restivo_no_2014, + title = {No praise without effort: experimental evidence on how rewards affect {Wikipedia}'s contributor community}, + volume = {17}, + issn = {1369-118X}, + shorttitle = {No praise without effort}, + url = {http://www.tandfonline.com/doi/abs/10.1080/1369118X.2014.888459}, + doi = {10.1080/1369118X.2014.888459}, + number = {4}, + urldate = {2014-03-25}, + journal = {Information, Communication \& Society}, + author = {Restivo, Michael and van de Rijt, Arnout}, + year = {2014}, + pages = {1--12} +} + +@book{murnane_methods_2011, + address = {Oxford ; New York}, + title = {Methods {Matter}: {Improving} {Causal} {Inference} in {Educational} and {Social} {Science} {Research}}, + isbn = {978-0-19-975386-4}, + shorttitle = {Methods matter}, + publisher = {Oxford University Press}, + author = {Murnane, Richard J. and Willett, John B.}, + year = {2011}, + keywords = {Education, Quantitative research, research methodology} +} + +@inproceedings{adler_content-driven_2007, + address = {New York, NY, USA}, + series = {{WWW} '07}, + title = {A {Content}-driven {Reputation} {System} for the {Wikipedia}}, + isbn = {978-1-59593-654-7}, + url = {http://doi.acm.org/10.1145/1242572.1242608}, + doi = {10.1145/1242572.1242608}, + urldate = {2014-05-08}, + booktitle = {Proceedings of the 16th {International} {Conference} on {World} {Wide} {Web}}, + publisher = {ACM}, + author = {Adler, B. Thomas and de Alfaro, Luca}, + year = {2007}, + keywords = {reputation, user-generated content, wikipedia}, + pages = {261--270} +} + +@incollection{benkler_peer_2015, + address = {Cambridge, MA}, + title = {Peer {Production}: {A} {Form} of {Collective} {Intelligence}}, + booktitle = {The {Handbook} of {Collective} {Intelligence}}, + publisher = {MIT Press}, + author = {Benkler, Yochai and Shaw, Aaron and Hill, Benjamin Mako}, + editor = {Bernstein, Michael and Malone, Thomas}, + year = {2015} +} + +@inproceedings{adler_assigning_2008, + address = {New York, NY, USA}, + series = {{WikiSym} '08}, + title = {Assigning {Trust} to {Wikipedia} {Content}}, + isbn = {978-1-60558-128-6}, + url = {http://doi.acm.org/10.1145/1822258.1822293}, + doi = {10.1145/1822258.1822293}, + urldate = {2014-11-01}, + booktitle = {Proceedings of the 4th {International} {Symposium} on {Wikis}}, + publisher = {ACM}, + author = {Adler, B. 
Thomas and Chatterjee, Krishnendu and de Alfaro, Luca and Faella, Marco and Pye, Ian and Raman, Vishwanath}, + year = {2008}, + pages = {26:1--26:12} +} + +@book{lave_situated_1991, + address = {Cambridge, UK}, + title = {Situated {Learning}: {Legitimate} {Peripheral} {Participation}}, + isbn = {978-0-521-42374-8}, + shorttitle = {Situated {Learning}}, + language = {en}, + publisher = {Cambridge University Press}, + author = {Lave, Jean and Wenger, Etienne}, + month = sep, + year = {1991}, + keywords = {Education / Educational Psychology, Psychology / Cognitive Psychology \& Cognition, Psychology / Developmental / General, Psychology / General, Psychology / Personality} +} + +@inproceedings{choi_socialization_2010, + title = {Socialization tactics in wikipedia and their effects}, + booktitle = {Proceedings of the 2010 {ACM} {Conference} on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Choi, Boreum and Alexander, Kira and Kraut, Robert E and Levine, John M}, + year = {2010}, + pages = {107--116} +} + +@book{rosenbaum_design_2010, + address = {New York}, + series = {Springer series in statistics}, + title = {Design of {Observational} {Studies}}, + isbn = {978-1-4419-1213-8}, + urldate = {2012-10-15}, + publisher = {Springer}, + author = {Rosenbaum, Paul R.}, + year = {2010}, + keywords = {analysis of variance, Beobachtungsstudie, Experimental design} +} + +@techreport{angrist_does_1990, + title = {Does compulsory school attendance affect schooling and earnings?}, + institution = {National Bureau of Economic Research}, + author = {Angrist, Joshua D and Krueger, Alan B}, + year = {1990} +} + +@article{dee_are_2004, + title = {Are there civic returns to education?}, + volume = {88}, + number = {9}, + journal = {Journal of Public Economics}, + author = {Dee, Thomas S}, + year = {2004}, + pages = {1697--1720} +} + +@article{allen_organizational_2006, + title = {Do organizational socialization tactics influence newcomer embeddedness and turnover?}, + volume = {32}, + number = {2}, + journal = {Journal of Management}, + author = {Allen, David G}, + year = {2006}, + pages = {237--256} +} + +@article{bauer_newcomer_2007, + title = {Newcomer adjustment during organizational socialization: a meta-analytic review of antecedents, outcomes, and methods.}, + volume = {92}, + number = {3}, + journal = {Journal of applied psychology}, + author = {Bauer, Talya N and Bodner, Todd and Erdogan, Berrin and Truxillo, Donald M and Tucker, Jennifer S}, + year = {2007}, + pages = {707} +} + +@inproceedings{lampe_classroom_2012, + title = {Classroom {Wikipedia} participation effects on future intentions to contribute}, + booktitle = {Proceedings of the {ACM} 2012 conference on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Lampe, Cliff and Obar, Jonathan and Ozkaya, Elif and Zube, Paul and Velasquez, Alcides}, + year = {2012}, + pages = {403--406} +} + +@article{wanous_newcomer_1993, + title = {Newcomer orientation programs that facilitate organizational entry}, + journal = {Personnel selection and assessment: Individual and organizational perspectives}, + author = {Wanous, John P}, + year = {1993}, + pages = {125--139} +} + +@book{angrist_mostly_2008, + address = {Princeton, NJ}, + title = {Mostly {Harmless} {Econometrics}: {An} {Empiricist}'s {Companion}}, + isbn = {978-1-4008-2982-8}, + shorttitle = {Mostly {Harmless} {Econometrics}}, + language = {en}, + publisher = {Princeton University Press}, + author = {Angrist, Joshua D. 
and Pischke, Jorn-Steffen}, + year = {2008}, + keywords = {Business \& Economics / Econometrics} +} + +@incollection{van_maanen_toward_1979, + address = {Greenwich, CT}, + title = {Toward a theory of organizational socialization}, + booktitle = {Research in organizational behavior}, + publisher = {JAI Press}, + author = {Van Maanen, John and Schein, Edgar H.}, + editor = {Staw, Barry M.}, + year = {1979}, + pages = {209--264} +} + +@incollection{kriesi_hanspeter_organizational_1996, + title = {The organizational structure of social movements in a political context}, + booktitle = {Comparative {Perspectives} on {Social} {Movements}}, + publisher = {Cambridge University Press}, + author = {Kriesi, Hanspeter}, + year = {1996}, + pages = {152--184} +} + +@article{crowston_kevin_free/libre_2012, + title = {Free/{Libre} open-source software development: {What} we know and what we do not know}, + volume = {44}, + number = {2}, + journal = {ACM Computing Surveys}, + author = {Crowston, Kevin and Wei, Kangning and Howison, James and Wiggins, Andrea}, + year = {2012}, + pages = {7} +} + +@article{shaffer_thick_1999, + title = {"{Thick}" {Authenticity}: {New} {Media} and {Authentic} {Learning}}, + volume = {10}, + shorttitle = {"{Thick}" {Authenticity}}, + url = {http://dl.acm.org/citation.cfm?id=325370.325387}, + number = {2}, + urldate = {2016-08-09}, + journal = {Journal of Interactive Learning Research}, + author = {Shaffer, David Williamson and Resnick, Mitchel}, + month = dec, + year = {1999}, + pages = {195--215} +} + +@inproceedings{antin_readers_2010, + address = {New York, NY, USA}, + series = {{CSCW} '10}, + title = {Readers {Are} {Not} {Free}-riders: {Reading} {As} a {Form} of {Participation} on {Wikipedia}}, + isbn = {978-1-60558-795-0}, + shorttitle = {Readers {Are} {Not} {Free}-riders}, + url = {http://doi.acm.org/10.1145/1718918.1718942}, + doi = {10.1145/1718918.1718942}, + urldate = {2015-05-09}, + booktitle = {Proceedings of the 2010 {ACM} {Conference} on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Antin, Judd and Cheshire, Coye}, + year = {2010}, + keywords = {free-riding, incomplete information, motivation, participation, social computing, wikipedia}, + pages = {127--130} +} + +@article{shaw_laboratories_2014, + title = {Laboratories of {Oligarchy}? {How} the {Iron} {Law} {Extends} to {Peer} {Production}}, + volume = {64}, + copyright = {© 2014 International Communication Association}, + issn = {1460-2466}, + shorttitle = {Laboratories of {Oligarchy}?}, + url = {http://onlinelibrary.wiley.com/doi/10.1111/jcom.12082/abstract}, + doi = {10.1111/jcom.12082}, + language = {en}, + number = {2}, + urldate = {2015-05-09}, + journal = {Journal of Communication}, + author = {Shaw, Aaron and Hill, Benjamin M.}, + month = apr, + year = {2014}, + pages = {215--238} +} + +@inproceedings{priedhorsky_creating_2007, + address = {New York, NY, USA}, + series = {{GROUP} '07}, + title = {Creating, {Destroying}, and {Restoring} {Value} in {Wikipedia}}, + isbn = {978-1-59593-845-9}, + url = {http://doi.acm.org/10.1145/1316624.1316663}, + doi = {10.1145/1316624.1316663}, + urldate = {2015-03-08}, + booktitle = {Proceedings of the 2007 {International} {ACM} {Conference} on {Supporting} {Group} {Work}}, + publisher = {ACM}, + author = {Priedhorsky, Reid and Chen, Jilin and Lam, Shyong (Tony) K.
and Panciera, Katherine and Terveen, Loren and Riedl, John}, + year = {2007}, + keywords = {collaboration, damage, vandalism, Wiki, wikipedia}, + pages = {259--268} +} + +@inproceedings{bryant_becoming_2005, + address = {New York, NY, USA}, + series = {{GROUP} '05}, + title = {Becoming {Wikipedian}: {Transformation} of {Participation} in a {Collaborative} {Online} {Encyclopedia}}, + isbn = {1-59593-223-2}, + shorttitle = {Becoming {Wikipedian}}, + url = {http://doi.acm.org/10.1145/1099203.1099205}, + doi = {10.1145/1099203.1099205}, + urldate = {2015-05-09}, + booktitle = {Proceedings of the 2005 {International} {ACM} {SIGGROUP} {Conference} on {Supporting} {Group} {Work}}, + publisher = {ACM}, + author = {Bryant, Susan L. and Forte, Andrea and Bruckman, Amy}, + year = {2005}, + keywords = {activity theory, community, legitimate peripheral participation, Wiki, wikipedia}, + pages = {1--10} +} + +@inproceedings{halfaker_jury_2009, + address = {New York, NY, USA}, + series = {{WikiSym} '09}, + title = {A {Jury} of {Your} {Peers}: {Quality}, {Experience} and {Ownership} in {Wikipedia}}, + isbn = {978-1-60558-730-1}, + shorttitle = {A {Jury} of {Your} {Peers}}, + url = {http://doi.acm.org/10.1145/1641309.1641332}, + doi = {10.1145/1641309.1641332}, + urldate = {2015-03-08}, + booktitle = {Proceedings of the 5th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Halfaker, Aaron and Kittur, Aniket and Kraut, Robert and Riedl, John}, + year = {2009}, + keywords = {experience, ownership, peer, peer review, quality, wikipedia, WikiWork}, + pages = {15:1--15:10} +} + +@inproceedings{butler_dont_2008, + address = {New York, NY, USA}, + series = {{CHI} '08}, + title = {Don't {Look} {Now}, but {We}'ve {Created} a {Bureaucracy}: {The} {Nature} and {Roles} of {Policies} and {Rules} in {Wikipedia}}, + isbn = {978-1-60558-011-1}, + shorttitle = {Don't {Look} {Now}, but {We}'ve {Created} a {Bureaucracy}}, + url = {http://doi.acm.org/10.1145/1357054.1357227}, + doi = {10.1145/1357054.1357227}, + urldate = {2015-05-09}, + booktitle = {Proceedings of the {SIGCHI} {Conference} on {Human} {Factors} in {Computing} {Systems}}, + publisher = {ACM}, + author = {Butler, Brian and Joyce, Elisabeth and Pike, Jacqueline}, + year = {2008}, + keywords = {collaboration, community, dynamics, policies, policy, rules, wikipedia, wikis}, + pages = {1101--1110} +} + +@inproceedings{guzdial_imagineering_2006, + address = {New York, NY, USA}, + series = {{ICER} '06}, + title = {Imagineering {Inauthentic} {Legitimate} {Peripheral} {Participation}: {An} {Instructional} {Design} {Approach} for {Motivating} {Computing} {Education}}, + isbn = {978-1-59593-494-9}, + shorttitle = {Imagineering {Inauthentic} {Legitimate} {Peripheral} {Participation}}, + url = {http://doi.acm.org/10.1145/1151588.1151597}, + doi = {10.1145/1151588.1151597}, + urldate = {2016-08-09}, + booktitle = {Proceedings of the {Second} {International} {Workshop} on {Computing} {Education} {Research}}, + publisher = {ACM}, + author = {Guzdial, Mark and Tew, Allison Elliott}, + year = {2006}, + keywords = {course design, CS1/2, non-majors, Programming}, + pages = {51--58} +} + +@inproceedings{suh_singularity_2009, + address = {New York, NY, USA}, + series = {{WikiSym} '09}, + title = {The {Singularity} is {Not} {Near}: {Slowing} {Growth} of {Wikipedia}}, + volume = {8}, + isbn = {978-1-60558-730-1}, + shorttitle = {The {Singularity} is {Not} {Near}}, + url = {http://doi.acm.org/10.1145/1641309.1641322}, + doi =
{10.1145/1641309.1641322}, + urldate = {2016-04-21}, + booktitle = {Proceedings of the 5th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Suh, Bongwon and Convertino, Gregorio and Chi, Ed H. and Pirolli, Peter}, + year = {2009}, + keywords = {growth, logistic model, population, power law, resistance, wikipedia}, + pages = {1--10} +} + +@inproceedings{hsieh_welcome!:_2013, + address = {New York, NY, USA}, + series = {{CSCW} '13}, + title = {"{Welcome}!": {Social} and {Psychological} {Predictors} of {Volunteer} {Socializers} in {Online} {Communities}}, + isbn = {978-1-4503-1331-5}, + shorttitle = {"{Welcome}!"}, + url = {http://doi.acm.org/10.1145/2441776.2441870}, + doi = {10.1145/2441776.2441870}, + urldate = {2016-04-21}, + booktitle = {Proceedings of the 2013 {Conference} on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Hsieh, Gary and Hou, Youyang and Chen, Ian and Truong, Khai N.}, + year = {2013}, + keywords = {Online Communities, reddit, volunteer socialization}, + pages = {827--838} +} + +@inproceedings{reed_exploratory_2013, + title = {An {Exploratory} {Factor} {Analysis} of {Motivations} for {Participating} in {Zooniverse}, a {Collection} of {Virtual} {Citizen} {Science} {Projects}}, + doi = {10.1109/HICSS.2013.85}, + booktitle = {2013 46th {Hawaii} {International} {Conference} on {System} {Sciences} ({HICSS})}, + author = {Reed, J. and Raddick, M. J. and Lardner, A. and Carney, K.}, + month = jan, + year = {2013}, + keywords = {astronomy, computer mediated communication, Context, Correlation, Educational institutions, Electronic mail, exploratory factor analysis, Internet, Loading, Motivations, scientific information systems, social engagement, social sciences, three-factor solution, VCS projects, virtual citizen science projects, volunteer participation, Web site, Web sites, Web survey, Zooniverse}, + pages = {610--619} +} + +@inproceedings{olsen_evaluating_2007, + address = {New York, NY, USA}, + series = {{UIST} '07}, + title = {Evaluating {User} {Interface} {Systems} {Research}}, + isbn = {978-1-59593-679-0}, + url = {http://doi.acm.org/10.1145/1294211.1294256}, + doi = {10.1145/1294211.1294256}, + booktitle = {Proceedings of the 20th {Annual} {ACM} {Symposium} on {User} {Interface} {Software} and {Technology}}, + publisher = {ACM}, + author = {Olsen, Jr., Dan R.}, + year = {2007}, + keywords = {user interface systems evaluation}, + pages = {251--258} +} + +@inproceedings{klokmose_webstrates:_2015, + address = {New York, NY, USA}, + series = {{UIST} '15}, + title = {Webstrates: {Shareable} {Dynamic} {Media}}, + isbn = {978-1-4503-3779-3}, + shorttitle = {Webstrates}, + url = {http://doi.acm.org/10.1145/2807442.2807446}, + doi = {10.1145/2807442.2807446}, + urldate = {2016-04-07}, + booktitle = {Proceedings of the 28th {Annual} {ACM} {Symposium} on {User} {Interface} {Software} \& {Technology}}, + publisher = {ACM}, + author = {Klokmose, Clemens N. and Eagan, James R. 
and Baader, Siemen and Mackay, Wendy and Beaudouin-Lafon, Michel}, + year = {2015}, + keywords = {dynamic media, real-time collaborative documents, web}, + pages = {280--290} +} + +@article{christy_leaderboards_2014, + title = {Leaderboards in a virtual classroom: {A} test of stereotype threat and social comparison explanations for women's math performance}, + volume = {78}, + issn = {0360-1315}, + shorttitle = {Leaderboards in a virtual classroom}, + url = {http://www.sciencedirect.com/science/article/pii/S0360131514001195}, + doi = {10.1016/j.compedu.2014.05.005}, + urldate = {2016-01-23}, + journal = {Computers \& Education}, + author = {Christy, Katheryn R. and Fox, Jesse}, + month = sep, + year = {2014}, + keywords = {Distance education and telelearning, Gender studies, Human-computer interface, Interactive learning environments, Teaching/learning strategies}, + pages = {66--77} +} + +@inproceedings{hakulinen_effect_2014, + title = {The {Effect} of {Gamification} on {Students} with {Different} {Achievement} {Goal} {Orientations}}, + doi = {10.1109/LaTiCE.2014.10}, + booktitle = {2014 {International} {Conference} on {Teaching} and {Learning} in {Computing} and {Engineering} ({LaTiCE})}, + author = {Hakulinen, L. and Auvinen, T.}, + month = apr, + year = {2014}, + keywords = {achievement badges, achievement goal orientation, achievement goal orientation profile, algorithm course, analysis of variance, avoidance oriented group, computer aided instruction, computer games, computer science, Context, data analysis, data structure, data structures, educational courses, Educational institutions, Games, Gamification, gamification effect, Human factors, log data analysis, mastery extrinsic approach orientation, mastery intrinsic approach orientation, online learning environment, performance approach orientation, psychological conceptualization, Psychology, student behavior analysis, student preference characterization, students motivation}, + pages = {9--16} +} + +@incollection{hamari_persuasive_2014, + series = {Lecture {Notes} in {Computer} {Science}}, + title = {Do {Persuasive} {Technologies} {Persuade}? - {A} {Review} of {Empirical} {Studies}}, + copyright = {2014 Springer International Publishing Switzerland}, + isbn = {978-3-319-07126-8 978-3-319-07127-5}, + shorttitle = {Do {Persuasive} {Technologies} {Persuade}?}, + url = {http://link.springer.com/chapter/10.1007/978-3-319-07127-5_11}, + language = {en}, + number = {8462}, + urldate = {2016-01-23}, + booktitle = {Persuasive {Technology}}, + publisher = {Springer International Publishing}, + author = {Hamari, Juho and Koivisto, Jonna and Pakkanen, Tuomas}, + editor = {Spagnolli, Anna and Chittaro, Luca and Gamberini, Luciano}, + month = may, + year = {2014}, + note = {DOI: 10.1007/978-3-319-07127-5\_11}, + keywords = {behavioral change support system, captology, Computers and Society, game-based learning, Gamification, Health Informatics, health technology, Information Systems Applications (incl. Internet), motivational affordance, Multimedia Information Systems, persuasive computing, Persuasive technology, sustainability, User Interfaces and Human Computer Interaction}, + pages = {118--136} +} + +@inproceedings{hamari_does_2014, + address = {Washington, DC, USA}, + series = {{HICSS} '14}, + title = {Does {Gamification} {Work}? 
- {A} {Literature} {Review} of {Empirical} {Studies} on {Gamification}}, + isbn = {978-1-4799-2504-9}, + shorttitle = {Does {Gamification} {Work}?}, + url = {http://dx.doi.org/10.1109/HICSS.2014.377}, + doi = {10.1109/HICSS.2014.377}, + urldate = {2016-01-23}, + booktitle = {Proceedings of the 2014 47th {Hawaii} {International} {Conference} on {System} {Sciences}}, + publisher = {IEEE Computer Society}, + author = {Hamari, Juho and Koivisto, Jonna and Sarsa, Harri}, + year = {2014}, + keywords = {Gamification, hci, motivation, motivational affordance, Persuasive technology}, + pages = {3025--3034} +} + +@article{hanus_assessing_2015, + title = {Assessing the effects of gamification in the classroom: {A} longitudinal study on intrinsic motivation, social comparison, satisfaction, effort, and academic performance}, + volume = {80}, + issn = {0360-1315}, + shorttitle = {Assessing the effects of gamification in the classroom}, + url = {http://www.sciencedirect.com/science/article/pii/S0360131514002000}, + doi = {10.1016/j.compedu.2014.08.019}, + urldate = {2016-01-23}, + journal = {Computers \& Education}, + author = {Hanus, Michael D. and Fox, Jesse}, + month = jan, + year = {2015}, + keywords = {Human-computer interface, Improving classroom teaching, Interactive learning environments, Teaching strategies, Virtual reality}, + pages = {152--161} +} + +@article{koivisto_demographic_2014, + title = {Demographic differences in perceived benefits from gamification}, + volume = {35}, + issn = {0747-5632}, + url = {http://www.sciencedirect.com/science/article/pii/S0747563214001289}, + doi = {10.1016/j.chb.2014.03.007}, + urldate = {2016-01-23}, + journal = {Computers in Human Behavior}, + author = {Koivisto, Jonna and Hamari, Juho}, + month = jun, + year = {2014}, + keywords = {Demographics, Games for health, Gamification, Gender, Persuasive technology, social networking}, + pages = {179--188} +} + +@article{zuckerman_deconstructing_2014, + title = {Deconstructing gamification: evaluating the effectiveness of continuous measurement, virtual rewards, and social comparison for promoting physical activity}, + volume = {18}, + issn = {1617-4909, 1617-4917}, + shorttitle = {Deconstructing gamification}, + url = {http://link.springer.com/article/10.1007/s00779-014-0783-2}, + doi = {10.1007/s00779-014-0783-2}, + language = {en}, + number = {7}, + urldate = {2016-01-23}, + journal = {Personal and Ubiquitous Computing}, + author = {Zuckerman, Oren and Gal-Oz, Ayelet}, + month = jul, + year = {2014}, + keywords = {Behavior change, Computer Science, general, Gamification, Personal Computing, Persuasive technology, Physical activity, Social comparison, User Interfaces and Human Computer Interaction, Virtual reward}, + pages = {1705--1719} +} + +@inproceedings{zhu_effectiveness_2012, + address = {New York, NY, USA}, + series = {{CSCW} '12}, + title = {Effectiveness of {Shared} {Leadership} in {Online} {Communities}}, + isbn = {978-1-4503-1086-4}, + url = {http://doi.acm.org/10.1145/2145204.2145269}, + doi = {10.1145/2145204.2145269}, + urldate = {2016-05-13}, + booktitle = {Proceedings of the {ACM} 2012 {Conference} on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Zhu, Haiyi and Kraut, Robert and Kittur, Aniket}, + year = {2012}, + keywords = {motivation, Online Communities, shared leadership, wikipedia}, + pages = {407--416} +} + +@article{joseph_designing_2003, + title = {Designing {Interesting} {Learning} {Environments} {When} the {Medium} isn't enough}, + volume = {9}, + issn 
= {1354-8565, 1748-7382}, + url = {http://con.sagepub.com/content/9/2/84}, + doi = {10.1177/135485650300900207}, + language = {en}, + number = {2}, + urldate = {2016-08-09}, + journal = {Convergence: The International Journal of Research into New Media Technologies}, + author = {Joseph, Diana and Nacu, Denise C.}, + month = jun, + year = {2003}, + pages = {84--115} +} + +@inproceedings{le_self-presentation:_2010, + title = {Self-{Presentation}: {Structured} and semi-structured user profiles}, + booktitle = {Studying {Online} {Behavior} {Workshop}}, + author = {Le, Linda and Beschastnikh, Ivan and McDonald, David W.}, + year = {2010} +} + +@inproceedings{mugar_planet_2014, + address = {New York, NY, USA}, + series = {{CSCW} '14}, + title = {Planet {Hunters} and {Seafloor} {Explorers}: {Legitimate} {Peripheral} {Participation} {Through} {Practice} {Proxies} in {Online} {Citizen} {Science}}, + isbn = {978-1-4503-2540-0}, + shorttitle = {Planet {Hunters} and {Seafloor} {Explorers}}, + url = {http://doi.acm.org/10.1145/2531602.2531721}, + doi = {10.1145/2531602.2531721}, + urldate = {2016-05-26}, + booktitle = {Proceedings of the 17th {ACM} {Conference} on {Computer} {Supported} {Cooperative} {Work} \& {Social} {Computing}}, + publisher = {ACM}, + author = {Mugar, Gabriel and \O{}sterlund, Carsten and Hassman, Katie DeVries and Crowston, Kevin and Jackson, Corey Brian}, + year = {2014}, + keywords = {citizen science, legitimate peripheral participation, situated learning, socialization, social translucence}, + pages = {109--119} +} + +@article{ducheneaut_socialization_2005, + title = {Socialization in an {Open} {Source} {Software} {Community}: {A} {Socio}-{Technical} {Analysis}}, + volume = {14}, + issn = {1573-7551}, + url = {http://dx.doi.org/10.1007/s10606-005-9000-1}, + doi = {10.1007/s10606-005-9000-1}, + number = {4}, + journal = {Computer Supported Cooperative Work (CSCW)}, + author = {Ducheneaut, Nicolas}, + year = {2005}, + pages = {323--368} +} + +@inproceedings{ciampaglia_moodbar:_2015, + address = {New York, NY, USA}, + series = {{CSCW} '15}, + title = {{MoodBar}: {Increasing} {New} {User} {Retention} in {Wikipedia} {Through} {Lightweight} {Socialization}}, + isbn = {978-1-4503-2922-4}, + shorttitle = {{MoodBar}}, + url = {http://doi.acm.org/10.1145/2675133.2675181}, + doi = {10.1145/2675133.2675181}, + urldate = {2016-05-26}, + booktitle = {Proceedings of the 18th {ACM} {Conference} on {Computer} {Supported} {Cooperative} {Work} \& {Social} {Computing}}, + publisher = {ACM}, + author = {Ciampaglia, Giovanni Luca and Taraborelli, Dario}, + year = {2015}, + keywords = {Experiment, online community, socialization, user retention, wikipedia}, + pages = {734--742} +} + +@inproceedings{farzan_socializing_2012, + address = {New York, NY, USA}, + series = {{CSCW} '12}, + title = {Socializing {Volunteers} in an {Online} {Community}: {A} {Field} {Experiment}}, + isbn = {978-1-4503-1086-4}, + shorttitle = {Socializing {Volunteers} in an {Online} {Community}}, + url = {http://doi.acm.org/10.1145/2145204.2145256}, + doi = {10.1145/2145204.2145256}, + urldate = {2016-05-26}, + booktitle = {Proceedings of the {ACM} 2012 {Conference} on {Computer} {Supported} {Cooperative} {Work}}, + publisher = {ACM}, + author = {Farzan, Rosta and Kraut, Robert and Pal, Aditya and Konstan, Joseph}, + year = {2012}, + keywords = {Experiment, online volunteer communities, socialization}, + pages = {325--334} +} + +@inproceedings{geiger_defense_2012, + address = {Dublin, Ireland}, + title = {Defense {Mechanism} or 
{Socialization} {Tactic}? {Improving} {Wikipedia}'s {Notifications} to {Rejected} {Contributors}}, + shorttitle = {Defense {Mechanism} or {Socialization} {Tactic}?}, + url = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM12/paper/view/4657}, + urldate = {2016-05-27}, + booktitle = {Sixth {International} {AAAI} {Conference} on {Weblogs} and {Social} {Media}}, + publisher = {AAAI Publications}, + author = {Geiger, R. Stuart and Halfaker, Aaron and Pinchuk, Maryana and Walling, Steven}, + month = may, + year = {2012}, + pages = {122--129} +} + +@inproceedings{musicant_mentoring_2011, + address = {New York, NY, USA}, + series = {{WikiSym} '11}, + title = {Mentoring in {Wikipedia}: {A} {Clash} of {Cultures}}, + isbn = {978-1-4503-0909-7}, + shorttitle = {Mentoring in {Wikipedia}}, + url = {http://doi.acm.org/10.1145/2038558.2038586}, + doi = {10.1145/2038558.2038586}, + urldate = {2016-05-27}, + booktitle = {Proceedings of the 7th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Musicant, David R. and Ren, Yuqing and Johnson, James A. and Riedl, John}, + year = {2011}, + keywords = {mentoring, newcomer retention, wikipedia}, + pages = {173--182} +} + +@incollection{deci_intrinsic_1975, + address = {New York NY}, + title = {Intrinsic {Motivation}}, + copyright = {Copyright 2010 John Wiley \& Sons, Inc. All rights reserved.}, + isbn = {978-0-470-47921-6}, + url = {http://onlinelibrary.wiley.com/doi/10.1002/9780470479216.corpsy0467/abstract}, + language = {en}, + urldate = {2016-08-09}, + booktitle = {The {Corsini} {Encyclopedia} of {Psychology}}, + publisher = {John Wiley \& Sons, Inc.}, + author = {Deci, Edward L. and Ryan, Richard M.}, + year = {1975}, + keywords = {extrinsic rewards, needs for autonomy and competence, play, undermining intrinsic motivation} +} + +@inproceedings{zhang_how_2012, + title = {How long do {Wikipedia} editors keep active?}, + isbn = {978-1-4503-1605-7}, + url = {http://dl.acm.org/citation.cfm?doid=2462932.2462938}, + doi = {10.1145/2462932.2462938}, + language = {en}, + urldate = {2016-08-09}, + publisher = {ACM Press}, + author = {Zhang, Dell and Prior, Karl and Levene, Mark}, + year = {2012}, + pages = {1} +} + +@book{harrison_supporting_2002, + title = {Supporting {Lifelong} {Learning}: {Perspectives} on learning}, + isbn = {978-0-415-25927-9}, + shorttitle = {Supporting {Lifelong} {Learning}}, + language = {en}, + publisher = {Psychology Press}, + author = {Harrison, Roger and Reeve, Fiona}, + year = {2002}, + keywords = {Education / Adult \& Continuing Education, Education / General, Education / History, History / General} +} + +@inproceedings{ekstrand_rv_2009, + address = {New York, NY, USA}, + series = {{WikiSym} '09}, + title = {rv you're dumb: {Identifying} {Discarded} {Work} in {Wiki} {Article} {History}}, + isbn = {978-1-60558-730-1}, + shorttitle = {rv you're dumb}, + url = {http://doi.acm.org/10.1145/1641309.1641317}, + doi = {10.1145/1641309.1641317}, + urldate = {2016-08-09}, + booktitle = {Proceedings of the 5th {International} {Symposium} on {Wikis} and {Open} {Collaboration}}, + publisher = {ACM}, + author = {Ekstrand, Michael D. 
and Riedl, John T.}, + year = {2009}, + keywords = {article history, visualization, Wiki, wikipedia}, + pages = {4:1--4:10} +} + +@book{gerber_field_2012, + address = {New York}, + title = {Field {Experiments}: {Design}, {Analysis}, and {Interpretation}}, + publisher = {WW Norton}, + author = {Gerber, Alan S and Green, Donald P}, + year = {2012} +} + +@inproceedings{farzan_wikipedia_2013, + address = {New York, NY, USA}, + series = {{CHI} '13}, + title = {Wikipedia {Classroom} {Experiment}: {Bidirectional} {Benefits} of {Students}' {Engagement} in {Online} {Production} {Communities}}, + isbn = {978-1-4503-1899-0}, + shorttitle = {Wikipedia {Classroom} {Experiment}}, + url = {http://doi.acm.org/10.1145/2470654.2470765}, + doi = {10.1145/2470654.2470765}, + urldate = {2016-08-09}, + booktitle = {Proceedings of the {SIGCHI} {Conference} on {Human} {Factors} in {Computing} {Systems}}, + publisher = {ACM}, + author = {Farzan, Rosta and Kraut, Robert E.}, + year = {2013}, + keywords = {Experiment, online volunteer community, socialization}, + pages = {783--792} +} + +@incollection{potthast_automatic_2008, + address = {Berlin, Germany}, + series = {Lecture {Notes} in {Computer} {Science}}, + title = {Automatic {Vandalism} {Detection} in {Wikipedia}}, + copyright = {2008 Springer-Verlag Berlin Heidelberg}, + isbn = {978-3-540-78645-0 978-3-540-78646-7}, + url = {http://link.springer.com/chapter/10.1007/978-3-540-78646-7_75}, + language = {en}, + number = {4956}, + urldate = {2016-08-09}, + booktitle = {Advances in {Information} {Retrieval}}, + publisher = {Springer}, + author = {Potthast, Martin and Stein, Benno and Gerling, Robert}, + editor = {Macdonald, Craig and Ounis, Iadh and Plachouras, Vassilis and Ruthven, Ian and White, Ryen W.}, + month = mar, + year = {2008}, + note = {DOI: 10.1007/978-3-540-78646-7\_75}, + keywords = {Artificial Intelligence (incl. Robotics), Database Management, Data Mining and Knowledge Discovery, Information Storage and Retrieval, Information Systems Applications (incl. Internet), Multimedia Information Systems}, + pages = {663--668} +} \ No newline at end of file diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round1.txt b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round1.txt new file mode 100644 index 0000000..ed0406f --- /dev/null +++ b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round1.txt @@ -0,0 +1,718 @@ +From: +Date: Tue, Jul 12, 2016 at 11:15 PM +Subject: CSCW 2017 notification - #516 +To: snehanarayan@gmail.com +Cc: papers2017@cscw.acm.org + + +Dear Sneha Narayan - + +Congratulations! + +Your paper: + +516 - The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial +for New Users + +is one of the 52% of CSCW 2017 submissions invited to revise and resubmit. +There were 530 total submissions to CSCW 2017, a similar number to last +year. The reviewers for this submission believe that it has the potential +to be revised within four weeks -- revisions are due August 9, 2016 -- to +become a contribution to what will be an exceptional conference. + +The program committee expects all authors to take advantage of this four +week revision period to improve their submissions by addressing reviewers' +comments (below). Some submissions need only minor revisions, while others +will require considerable work over the next four weeks to result in an +acceptable submission, and will not succeed without significant effort. 
+Your reviews, especially the summary report from the Coordinator, should +make clear what you should do. You can gauge your prospects from your +reviews and the summary report: overall scores of 4s and 5s indicate the +reviewers are very confident your paper will be acceptable within four +weeks with small edits. Overall scores of 3 and 4 indicate you have some +work to do. Scores of 3 and below indicate that some reviewers have serious +reservations, though other reviewers see promise. + +The same reviewers will read and evaluate your revised submission (though +additional reviewers may be added for papers where the reviewers are +divided). You need not satisfy every reviewer or make every suggested +change, but your revision will need to convince most of the reviewers that +it is now ready for publication. For some papers the reviewers have +requested a lot of work, and you might feel that it is too much to achieve in a +four-week period. If you have the time to reach that goal: great! If not, +that is okay; you are free to withdraw your submission. Please decide +whether or not the key points made by reviewers can be adequately addressed +in the time provided, given other demands on your time. If you choose to +withdraw your paper, please notify us explicitly at papers2017@cscw.acm.org. +Papers that are revised and re-submitted in the next round will receive +revised reviews. + +Your revision must be accompanied by a separate "Summary of Changes" +document (in PDF format) that lists the reviewers' comments and your +responses, even for comments that did not lead to changes in the manuscript +(in which case you might explain why you chose not to make certain +suggested changes). This could be a set of bullet points, a table, or +numbered points by which reviewers' comments are summarized along with your +changes. This is not a rebuttal, but rather a description of changes made, +or of reasons you could not or chose not to take the reviewers' advice. To +become acceptable, your submission must be revised, and your document +describing the changes will greatly help reviewers see what you have or +have not changed, along with your reasons for doing so. + +Just to be clear, you must submit a revised paper and summary of changes by +the deadline. Any paper where a revision and summary are not submitted +will be considered withdrawn. + +Example summaries from past years' papers can be found at +http://bit.ly/16U8BGM. + +Please submit your revision and the response document at your "Submissions +in Progress" page at https://precisionconference.com/~cscw17a/ by 11:59 PM +PDT, August 9, 2016. + +CSCW 2017 will be a great conference, and we sincerely hope you are part of +it! If you have any issues or questions, please let us know. And thanks +again for submitting. + +Sincerely, +Louise Barkhuus, Marcos Borges, Wendy A. Kellogg +CSCW 2017 Co-chairs + + + +------------------------ Submission 516, Review 4 ------------------------ + +Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial +for New Users + +Reviewer: AC + +Expertise + + 2 (Passing Knowledge) + +First Round Overall Recommendation + + 3 (Maybe acceptable (with significant modifications)) + +Contribution and Criteria for Evaluation + + The paper presents the design and evaluation of a gamified tool for + socializing and retaining new Wikipedia editors.
Contribution criteria + include (1) a description and rationale for the system; (2) system + novelty and rationale for how it leads to learning; and (3) a + methodologically sound evaluation. + +First Round Review (if needed) + + +Coordinator's First-Round Report to Authors + + The paper presents the design and evaluation of a gamified tool for + socializing and retaining new Wikipedia editors. The study found that + users liked—but did not learn from—the system. + + The focus on improving the experience of newcomers in Wikipedia is + relevant and important. Reviewers describe the study as well-motivated + and exceptionally well-written. Read R3’s comments on the writing + quality and congratulate yourself! + + The reviewers, however, have many concerns about the paper—each + focusing on a different aspect of the work. The concerns the reviewers + note /may/ be addressable during the revise and resubmit period, but it + will be an exceptionally herculean effort. Also, please keep in mind that + there is no guarantee of acceptance even after making changes. So, it is + at the authors’ discretion whether to proceed with + revisions or withdraw the paper. + + There is a split amongst the reviewers as to whether the failure of the + tool is interesting or not. R1 raises concerns that the failure of the + tool could be predicted from existing literature, suggesting little + rationale for doing the work in the first place. R2 asks whether there + is something fundamentally different about people who continue to + contribute to Wikipedia, and as such whether the system holds value in + practice. R3, on the other hand, sees much value in the systems + contribution of the work as well as the real-world evaluation. R3's + review has some suggestions of alternative framings that may make the + contribution more valuable. + + In the treatment of related work, many improvements are needed. R1 notes that + the discussion of the well-known concept of legitimacy/authenticity in + learning environments is missing. R2 also points to missing literature + about Wikipedian experience. + + R2 and R3 raise a number of methodological questions about the paper. R2 + suggests the distribution of participants across the timeline may bias + the results. R3, on the other hand, sees opportunity here, suggesting + additional statistical analysis related to longevity and power users. + Both R2 and R3 question the methodological choice and contribution of + measuring perceptions of learning rather than actual learning. Overall, + this points to a need for at the very least justifying the methodological + choices and at the most carrying out additional statistical analyses. + + In summary, there is quite a bit of work to be done. I wish the authors + the best of luck, should they choose to continue in the review process. + + +Requested Revisions + + REQUIRED: + - Provide justification for why the study was worth carrying out, in + response to R1 and R2’s concerns. R3’s review may have some insight + into alternative framings. + - State the research questions more explicitly, as per R2’s + recommendation. + - Address R1 and R2’s concerns about missing literature. + - Ensure that the narrative around Wikipedia is clear to readers who do + not have an in-depth background in production/editing details. + - Improve the clarity of the results by using percentages or another + baseline that allows comparison between numbers, as per R2’s review.
+ - Provide justification for measuring perceptions of learning versus + actual learning. + - Provide a robust discussion of why the results are meaningful for + researchers and/or practitioners. + + OPTIONAL, RECOMMENDED + - Consider carrying out additional statistical analyses as recommended by + R3. + - Provide a short justification for use of English-language Wikipedia, as + per R2’s review. + +Formatting and Reference Issues + + + +------------------------ Submission 516, Review 1 ------------------------ + +Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial +for New Users + + +Expertise + + 4 (Expert) + +First Round Overall Recommendation + + 3 (Maybe acceptable (with significant modifications)) + +Contribution and Criteria for Evaluation + + In this paper, the authors present the design and two-pronged evaluation + of a tutorial for new Wikipedia editors that uses elements of + gamification like missions and badges to help coach new editors and help + them learn best practices and social norms of Wikipedia. The outcome is + that users like the system but, based on behavioral measures, they don't + actually learn from it. Learning interventions are a classic kind of + research problem, and the paper should include robust measures of + learning, as well as a good description of the designed intervention + itself, why the design is expected to lead to learning, and a clear + description of the study. + +Assessment of the Paper + + This is a reasonably well-motivated study with connections to appropriate + literature, and the writing is engaging and understandable. The problem of + enculturating newcomers into projects like Wikipedia is well documented, + and this paper investigates a potential intervention with an admirably + well-planned study. Designing learning interventions is really difficult, + and I commend the authors on a well-executed effort. + + Still, I am ambivalent about the paper because I would have predicted + these outcomes based on the literature alone. In the discussion, the + authors note that one mismatch between Wikipedia and the tutorial as + designed involves the “gradual peripheral participation” of newcomers + as they take on the identity of “Wikipedian.” They suggest that maybe + speeding up this process is unnatural. I would argue that the most + important concept from the literature on learning is missing from this + discussion, and that’s “legitimacy” (also sometimes referred to in + education and learning literature as “authenticity”.) The authors + explain that by doing tasks in a pretend version of Wikipedia, they make + it a safe space for newcomers to practice, yet performing “canned” + tasks in a pretend system is the opposite of offering a legitimate form + of participation. I immediately wonder: why not use what we know from the + literature to create low-risk missions that newcomers can complete while + legitimately contributing to the encyclopedia? Risk taking is a + fundamental characteristic of games that makes them engaging; it + certainly seems like it would play a role in people’s motivation in a + scenario like this. Rather than eliminating risk, the literature on + legitimate peripheral participation would suggest that finding the right + degree of risk is required to facilitate progressive entree into a set of + shared practices. + + I am disappointed by the missed opportunity here; the outcome mainly + seems to verify that what we know shouldn’t work based on the + literature in fact doesn’t work.
    Still, the paper isn’t bad, and the study is carefully crafted and
    reported.

    With some extension and reflection, I think the discussion could help
    point future research in a more fruitful direction. There are millions
    of pages written on the challenges of designing learning interventions
    that change people’s behavior, yet this paper ends on a painfully
    obvious note. It’s true that usability isn’t all it takes, but what can
    we learn from TWA about the design of systems to facilitate
    enculturation into a community of practice? What can we take away from
    this that might inform more successful tutorial systems in the future?

Formatting and Reference Issues



------------------------ Submission 516, Review 2 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users


Expertise

    4 (Expert)

First Round Overall Recommendation

    2 (Probably NOT acceptable)

Contribution and Criteria for Evaluation

    This paper's contribution is the design and evaluation of a structured
    introduction to a peer production community (English Wikipedia) called
    "The Wikipedia Adventure". TWA's design is rooted in theories of
    gamification, and its utility is evaluated through a user survey and an
    invitation-based field experiment. The paper reports on the survey
    respondents' satisfaction with TWA, and how their experiment results
    reveal some of the challenges of effecting lasting changes to
    contributor patterns in peer production communities. These findings are
    then discussed in relation to cultural factors in Wikipedia, issues of
    self-selection and voluntary participation, and the limitations of
    gamification.

    When evaluating a paper that describes the design of a system, the two
    main criteria are that the system and/or its development setting is/are
    novel, and that the way the system is evaluated is methodologically
    sound.

Assessment of the Paper

    As mentioned in the contribution section, this paper's contribution is
    the design and evaluation of a structured introduction to a peer
    production community based on gamification, called "The Wikipedia
    Adventure". This is a great idea and sounds like a useful addition to
    Wikipedia. The paper is written in a way that makes it easy to read,
    and provides the reader with a good introduction to how TWA's design is
    rooted in theories of gamification, thus applying these principles in
    what appears to be a novel setting. The paper also does a good job of
    discussing the findings, organizing them in a way that is easy to
    follow and touching on important points (e.g. cultural factors, and the
    limitations of self-selection and gamification).

    The overall ideas and approach taken in this paper are sound and in
    line with the criteria described previously. Unfortunately, there are
    two major issues and several minor ones that need to be resolved before
    this paper is ready for publication. The first major issue is that the
    methodology used to evaluate performance in the invitation-based
    experiment measures contribution in a skewed manner and does not
    establish why that is appropriate. Secondly, the paper fails to
    consider arguments put forth by Panciera et al.'s "Wikipedians are
    Born, Not Made" paper. This review will expand on both of these major
    issues below.
    Further below are notes and comments with suggestions for improvement
    for specific sections of the paper, some of which are rather
    substantial as well.

    1: Evaluating TWA effectiveness by number of contributions
    ----------------------------------------------------------

    A major part of the paper is the evaluation of TWA's effect on
    subsequent contributions. To evaluate this an invitation-based field
    experiment is used, and the paper does a great job of justifying why
    that is appropriate in this setting. The experiment runs from February
    2014 for three months. Exact dates are not given, so let us assume that
    it ran until the end of April 2014. User contributions are then
    measured until the end of May 2014.

    There are two problems with this approach that the paper fails to
    address properly. One is the issue of right-truncation found in the
    data. Contributors who joined in early February 2014 would have about
    four months to make edits, whereas those who joined in late April would
    only have about a month. The model does contain a control variable for
    number of days in the experiment, but why is that appropriate in this
    context? If we examine other work in the same domain, they tend to
    either use a much longer time period (e.g. the Teahouse paper, citation
    23, which uses 6-9 months) or ensure that the time period is fixed
    (e.g. Kittur et al. "Herding the Cats: The Influence of Groups in
    Coordinating Peer Production", WikiSym 2009; or Zhu et al.
    "Effectiveness of Shared Leadership in Online Communities", CSCW 2012).

    Related to the right-truncation problem, the paper also fails to
    discuss and justify what a reasonable timespan for measuring the effect
    of TWA is, and how that choice affects the number of contributions
    observed. It might for instance be that TWA instead has an effect on
    how long it takes before a user drops out of the system. If we assume
    that TWA has an effect on contributions, what timespan is needed to
    measure that effect? The paper assumes that a month is adequate to
    discover it, whereas one might suspect that it is only measurable over
    a longer period of time. If it is the case that a short period of time
    is appropriate (for instance because these users are likely to drop out
    after a certain amount of time) the paper needs to properly establish
    that, either by measuring it or referring to previous work.

    2: Wikipedians Are Born, Not Made
    ---------------------------------

    In their GROUP 2009 paper "Wikipedians Are Born, Not Made: A Study of
    Power Editors on Wikipedia", Panciera et al. show data arguing that
    those contributors who are going to stick around behave in a way that
    is different from the very beginning. In follow-up work published in
    2010 they find similar differences in another peer production
    community. (Panciera et al. "Lurking? cyclopaths?: a quantitative
    lifecycle analysis of user behavior in a geowiki." CHI 2010)

    These two papers and the argument they put forth are relevant because
    they question who TWA is designed for. In the related work a reference
    to Bryant et al.'s "Becoming Wikipedian" is made, thereby suggesting
    that TWA is designed to teach someone how to be a Wikipedian. Since
    Panciera et al.'s paper argues that these contributors are already
    Wikipedians, should TWA instead be designed to help them stay
    productive?
    If Wikipedians are born, not made, then one could also question whether
    these contributors are going to use TWA at all. Maybe they ignore TWA
    because they are already productive and do not need it? Since the paper
    never references these papers or discusses the issues they raise (e.g.
    "is the Teahouse more effective since it allows them to get answers
    when they need help?"), this whole topic area is left hanging.

    ---
    Below follow comments/notes for each section of the paper.

    Introduction:
    * An overall issue here is that there are few citations to sources. For
    instance a claim is made that "newly created accounts are the primary
    source of spam and vandalism on Wikipedia". Consider adding a
    "[citation needed]" after that.
    * When citing multiple papers it is preferable that they are in order,
    e.g. "[14, 23, 17]" should be "[14, 17, 23]" (page 1). This minor issue
    also occurs elsewhere in the paper.
    * "Unlike prior systems, TWA creates a structured experience that
    guides newcomers through critical pieces of Wikipedia knowledge..." Do
    we know that there are no other prior systems that offer a similar
    experience? It might be that there are none within the Wikipedia
    domain, but what about outside it? That sentence is making a rather
    bold claim.
    * After reading the introduction, what is the reader expected to
    remember as the main findings in this paper? At the end of the
    introduction the following sentence is found: "The study underscores
    the importance of conducting multiple types of evaluations of social
    systems." Is that the main contribution? What about the implications
    for gamified structured introductions to peer production?

    Background:
    * "...women reported that they found that contributing to Wikipedia
    involved a high level of conflict and that they lacked confidence in
    their expertise [8]. This suggests that more effective onboarding tools
    could help incorporate newcomers." This is an important side of
    Wikipedia, but how does TWA's design help mitigate this issue? Are
    there design elements in TWA that aim to boost confidence in one's
    expertise?
    * At the end of the introduction we find the following two questions:
    "Would a gamified tutorial produce a positive, enjoyable educational
    experience for new Wikipedians? Would playing the tutorial impact
    newcomer participation patterns?" These are the paper's _research
    questions_! It would be very helpful to the reader if they were
    displayed more clearly, e.g. as separate items. They should not be
    hidden.

    System Design:
    * "...it does not depend on the availability, helpfulness, or
    intervention of existing Wikipedia editors..." The underlying argument
    here is that scalability is preferable to personal interaction when
    socializing newcomers (in peer production communities). Why is that the
    better solution? As discussed previously, TWA might be designed for
    contributors who are not going to stick around; why are those the right
    audience for it? Is the goal to provide _everyone_ with a scalable,
    impersonal introduction, or is it better to provide _some_ (typically
    based on self-selection) with a personal introduction (e.g. the
    Teahouse)?

    Game-like elements (subsection of System Design):
    * In "Missions" a distinction is made between "basic" and "advanced"
    editing techniques. It appears somewhat arbitrary: why is adding
    sources advanced editing, but watchlists are not?
    * Your readers might not know what watchlists are; take care to write
    for a general audience, as not everyone knows a lot about how Wikipedia
    works behind the scenes.

    Study 1: User Survey:
    * This paper doesn't discuss any other language editions of Wikipedia
    besides the English one, and makes the assumption that "Wikipedia"
    equals the English edition. Adding a mention that Wikipedia exists in
    multiple languages and explaining why English was chosen as the
    language where TWA was launched would be very helpful.
    * The paper aims to measure "educational effectiveness". Why is a
    survey the appropriate way to measure that? Based on the description of
    the survey, it seems that it never asks specific questions to test
    whether TWA's users learned specific things, in other words whether the
    education was successful. Later when describing the results the phrase
    "learning to edit Wikipedia" is used; isn't that the _key_ learning
    goal of TWA? Yet the survey asks Likert-scale questions. In other
    words, you're measuring whether TWA users are under the impression that
    they learned something, not whether they actually did.
    * Figure 4 uses counts. While it shows that none of the questions had
    responses from all participants, it makes comparisons between questions
    with different response rates very difficult. Using percentages would
    allow for direct comparisons, and would make the references to the
    figure in the text easier to follow along with. The text refers to four
    questions with a certain percentage of responses, but leaves the math
    to the reader.
    * The survey leaves many questions unanswered, some of which the paper
    might want to address. Were any negative questions asked? Were there
    any control questions, such as a similar question worded slightly
    differently to allow for comparison between responses? As it is, this
    survey comes across as a set of positive statements about TWA that
    respondents agreed to. Given that respondents self-select and no
    attempts to contact users who didn't go through TWA appear to have been
    made, it is likely there is a bias in the responses, and that bias
    should be discussed.

    Study 2: Field Experiment:
    * The description of how accounts were selected to be included is
    rather confusing. First it describes 1,967 accounts that met the same
    criteria as for the user survey, however 10,000 individuals
    ("accounts"?) were invited to the beta. Why is one an order of
    magnitude larger than the other? Then in the second paragraph of
    "Methods" it describes the selection criteria, that at least one
    contribution would have to be made after getting invited. This would
    perhaps be much less confusing if the criteria were first explained,
    particularly how the experiment and control groups were set up, and
    then how many accounts were identified.
    * "This is a larger proportion of users than took up the invitation in
    Study 1, which may be due to changes in the invitation text." Earlier
    in the paper study 1 refers to a "beta", whereas this appears not to be
    one. If this is the case, this is an important difference between the
    two that should be made clear to the reader.
    * "we measure the overall contributions as the total number of edits
    made by each account from the time of inclusion in the study until May
    31, 2014." When exactly is the "time of inclusion"? Is that when they
    got the invite? What about when they completed one (or all) TWA
    mission(s)?
    The concern here is that all contributions are measured, whereas the
    experiment sets up a pre/post scenario. Later on the paper refers to
    "subsequent contributions", indicating that contributions after a
    certain point in time were measured. This quickly becomes rather
    confusing; spelling out clearly which points in a user's account
    history are used (e.g. "we measure contributions at four points in
    time: when the user registered their account, the time of invitation,
    when they first started using TWA, and the end of the experiment")
    would be very helpful.
    * Why is a six-edit radius chosen when measuring word persistence?
    Halfaker et al. make no claim about what the radius should be in the
    referenced work, and Ekstrand et al. suggest a 15-edit radius in a
    related paper (Ekstrand and Riedl "rv you're dumb: identifying
    discarded work in Wiki article history." WikiSym 2009). The six-edit
    radius also comes with an issue that is unaddressed: how long does it
    take for an edit made by a contributor in the study to reach that
    six-edit radius? If it hasn't been reached at the end of the study
    period, that edit has to be discarded as its quality is unknown. In a
    related paper, Farzan and Kraut instead chose to use the percentage of
    words that survived as a measure of quality (Farzan and Kraut
    "Wikipedia classroom experiment: bidirectional benefits of students'
    engagement in online production communities" CHI 2013).
    * Tables 1, 2, 3, and 4, as well as figure 6, should be brought closer
    together so it's easier to follow along. Table 1 occurs before the text
    that refers to it, and table 4 is two pages further along. Putting all
    tables and figure 6 on the same page might be a good solution.
    * Table 3 refers to users who "reached" a mission. It is confusing how
    181 users reached the final mission but did not complete it, yet in the
    text it seems these 181 users actually did.
    * The post-hoc power analysis is very useful!

    Discussion:
    * "The new editors in our study may have had unpleasant experiences
    during their initial time on Wikipedia..." It appears that the survey
    asked no questions about this, yet is it not a very important issue
    related to TWA's success?
    * In "Limitations of gamification" the following sentence is found:
    "...our study is among the first that compares levels of participation
    in a task among individuals who were introduced to gamified learning
    first to those that were not." This is an _important_ finding; it
    shouldn't be hidden back here but instead be up front in the
    introduction!

Formatting and Reference Issues



------------------------ Submission 516, Review 3 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users

Reviewer: AC-Reviewer

Expertise

    4 (Expert)

First Round Overall Recommendation

    3 (Maybe acceptable (with significant modifications))

Contribution and Criteria for Evaluation

    This paper presents the results of a deployment of a gamification-based
    system designed to retain new editors in Wikipedia. It is a negative
    results paper: the authors claim that they have conclusive evidence
    that the system did not work (although I have suggested a few
    additional lines of inquiry below that might problematize this
    assertion).

    The committee will have to have a discussion about how to evaluate this
    paper, and likely negative results papers more generally.
Assessment of the Paper

    This paper presents the results of a deployment of a gamification-based
    system designed to retain new editors in Wikipedia. It is a negative
    results paper: the authors claim that they have conclusive evidence
    that the system did not work (although I have suggested a few
    additional lines of inquiry below that might problematize this
    assertion).

    The paper is very well-written and has some large positives. It also is
    a negative results paper, and the committee will have to decide how to
    handle this. In general, I’m strongly sympathetic to arguments to
    include more negative results papers in our proceedings, but I’m quite
    unclear on the details of how to do so (e.g. what defines a top-quality
    negative results paper?). I’m hopeful that this paper can instigate a
    broader discussion on this topic at the PC meeting.

    All of that said, this paper also has a number of idiosyncratic
    limitations that make it perhaps not the best trial balloon for
    negative results papers. Below, I outline what I believe to be the
    paper’s positives and then describe these limitations in more detail,
    phrased as both critiques and questions.

    Overall, my recommendation is to invite the authors to revise and
    resubmit. If this occurs, I’ll want to see the below critiques
    addressed and the below questions answered (both through direct answers
    in the response to reviewers and through clarifications and changes to
    the paper). I’m hopeful that, through the R&R process, this paper can
    become an ideal negative results trial balloon.


    Important positives:

    * The authors built a system to solve a real-life problem and did a
    real-life, relatively large-scale deployment. Awesome!
    * The paper is easily in the top 5% in terms of writing quality. This
    is true both at the sentence level and at the narrative level. As a
    person who has to review lots of papers, this was a breath of fresh
    air.
    * The design of the game is quite well-thought-out, save a few
    relatively arbitrary decisions. I was particularly compelled by the use
    of gamification techniques that are also present in “real Wikipedia”
    (e.g. barnstar-like rewards).

    Critiques:

    CRITIQUE #1 – Excessive import placed on trivial self-report data: It
    is well-known that self-report data from participants is inferior to
    observations of actual behavior, and that self-report data can be quite
    unreliable more generally. As such, in my view, it is not a
    contribution to show that self-report data didn’t end up panning out in
    the behavioral results.

    In the next draft of this paper, I would like to see the authors
    address this issue. This might mean framing this paper as a full-on
    negative results paper, but lighter-weight adaptations might be
    possible.


    Open questions:

    QUESTION #1: As noted above, this paper is a negative results paper at
    its core, and we’ll have to have a broad discussion about this at the
    PC meeting, assuming the paper makes it this far. In the event that
    this occurs, can the authors provide a more robust argument as to why
    these negative results are important for other researchers and
    practitioners?

    The paper attempts to argue that one contribution that comes out of its
    negative results is to distrust self-report data, but this is
    well-known (see below). The other negative results argument in the
    paper is that these results add to growing evidence of long-term
    gamification failures.
    I find this argument much more compelling. In other words, by expanding
    on this argument, the authors may be able to address this question.

    That said, regardless of how this question is addressed in the second
    draft, I’d like to see it done both through changes to the paper and
    through discussion in the response to reviewers.

    QUESTION #2 – Is there a possibility that the statistical framework
    employed is not appropriate for this particular study?

    The authors utilize a two-level statistical approach that I haven’t
    seen before in the CSCW/CHI literature. I enjoyed thinking about this
    approach, and the authors did a relatively good job explaining it. That
    said, I’m currently not convinced that it was the appropriate framework
    for this study. Here’s my reasoning:

    (1) The goal here is to introduce a treatment that ultimately will
    produce strong new members of the Wikipedia community at a higher rate
    than the control.
    (2) Let’s say the game produces 3 such members out of 100 new editors
    and the control produces 1, which looks like it might be the case.
    Let’s also say that this pattern additionally persists over a large n.
    (3) If this is true, why do we care about the potentially moderating
    effect of the invitations?

    The authors argue that new editors who responded to the invitation to
    play the game might just be new editors who are engaged and,
    critically, would have been power editors whether or not the game
    existed. However, barring a random fluke, shouldn’t these future power
    editors also have been in the control group? If I’m right here, I’m
    thinking the invitation doesn’t matter and a more traditional
    statistical analysis (or at least one targeted at identifying rare
    events) is appropriate.

    I could be wrong, but I want the authors to respond to this question,
    both through feedback to reviewers and clarifications in the paper.

    As an important side note, if we agree that this framework is the right
    way to go in the end, the authors should puff their chests more about
    this by claiming it as a contribution (assuming it hasn’t been used at
    CSCW before).

    QUESTION #3 – Are the outcome variables considered here the best
    outcome variables? Are some critical variables missing?

    The authors seem focused on the average effects across the entire
    control and treatment groups (the two treatment groups, to be
    specific). However, would it not also be reasonable to consider the
    metric I describe above: the % of new editors that go on to be power
    editors? Since power editors end up contributing most of the edits
    anyway *over the long term*, to me this seems like the way to go (i.e.
    if this group of editors were followed for years, statistically
    significant differences would begin to emerge). If the authors agree,
    the authors need to reanalyze their data with this metric in mind.

    Another related outcome variable that might be useful to analyze is how
    long the new editors in each group remained active editors in the
    community (i.e. survival analysis). Because the data is quite old, this
    should be an easy new analysis to run, and longevity has been a
    variable of interest in a number of peer production studies.

    In their second draft and the feedback to reviewers, I would like to
    see the authors discuss either new analyses related to power users or
    why they did not consider this outcome variable. I would also like to
    see the same for survival analysis.
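    (To make the two analyses I am requesting concrete, here is a minimal,
    purely illustrative sketch in Python. Everything in it is hypothetical:
    the file name, the column names, the 100-edit "power editor" threshold,
    and the assumption of exactly two experimental conditions are all
    invented for illustration, not taken from the paper.)

        # Sketch only: power-editor rate comparison plus a Kaplan-Meier
        # longevity analysis. All data and column names are hypothetical.
        import pandas as pd
        from scipy.stats import fisher_exact
        from lifelines import KaplanMeierFitter

        users = pd.read_csv("twa_study_users.csv")  # hypothetical export

        # 1. Rare-event outcome: the share of new editors in each condition
        #    who become power editors. Fisher's exact test is used because
        #    the counts are small (a chi-squared test would be unreliable);
        #    it expects a 2x2 table, hence the two-condition assumption.
        users["is_power_editor"] = users["total_edits"] >= 100
        table = pd.crosstab(users["condition"], users["is_power_editor"])
        odds_ratio, p_value = fisher_exact(table.values)
        print(f"power-editor odds ratio: {odds_ratio:.2f}, p = {p_value:.4f}")

        # 2. Longevity: Kaplan-Meier survival curves per condition, with
        #    accounts still active at the end of the observation window
        #    right-censored (dropped_out == 0 marks a censored account).
        kmf = KaplanMeierFitter()
        for condition, group in users.groupby("condition"):
            kmf.fit(group["days_active"],
                    event_observed=group["dropped_out"],
                    label=str(condition))
            print(condition, "median days active:", kmf.median_survival_time_)

    (Either result should fold easily into the existing analysis; the
    survival curves in particular would directly address the longevity
    question raised above.)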
    QUESTION #4: Is there a path towards positive results?

    As noted above, I believe some discussion around this paper and
    negative results papers more generally will have to happen at the PC
    meeting. However, I think there are some missed opportunities here for
    positive results and that the authors were too quick to settle for
    negative results. This is likely an important factor to consider when
    deciding whether to accept a negative results paper.

    Most notably, there are several well-motivated, unexplored avenues that
    could lead to positive results that would have a much larger impact
    than the negative results presented here:

    * As noted above, examining additional outcome variables is important,
    most notably # of power editors and longevity.
    * Does the game work if folks are forced to play it prior to editing
    Wikipedia, as would be the case in most other institutionalized
    socialization contexts? This is not just a hypothetical: this game
    could be used in all Wikipedia Education Project classes and related
    endeavors.

Formatting and Reference Issues
diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round2.txt b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round2.txt
new file mode 100644
index 0000000..ba09a7c
--- /dev/null
+++ b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-reviews-round2.txt
@@ -0,0 +1,1064 @@
From:
Date: Tue, Sep 6, 2016 at 9:26 PM
Subject: CSCW 2017 notification - #516
To: snehanarayan@gmail.com
Cc: papers2017@cscw.acm.org


Dear Sneha Narayan -

We are pleased to inform you that your paper:

516 - The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users

has been accepted to CSCW 2017. Congratulations! This year we received 530
submissions, of which 183 have been accepted for presentation at the
conference.

We are writing to provide your second round reviews, and to give you
important information related to submitting your camera-ready paper and
presenting it at the conference.

First, your reviews are provided below. Please read these carefully and
make sure your final submission of the camera-ready paper is as good as
possible. In many cases reviewers have suggestions or requests that will
improve your paper.

Your next step is to prepare your camera-ready paper, which must be
submitted into the PCS system by October 31, 2016. You will also be
contacted by Sheridan Publishing, or directly by us, with specific
information about producing an appropriate PDF, choosing among ACM
copyright and license options, etc. Please pay special attention to the
citation format used by CSCW (e.g., author’s first name spelled out first,
but sorted by family name). All papers must be submitted in camera-ready
form to be included in the conference program.

Please note that an author of each paper must register for the conference
and attend it to present the paper. Papers without a registered presenter
will be removed from the proceedings. Registration will open in the Fall,
with the greatest discounts available until the early registration deadline
of January 11, 2017. Please be sure that at least one author registers by
that date. Also, please let us know if the presenting author is someone
other than the contact author for this paper so we can appropriately reach
that person with any needed information.
Finally, if you are coming from a
country where a visa is required to visit the US, please be sure to start
the process of getting that visa early.

Soon after October 31st we will post a presentation schedule on the CSCW
2017 website so you can plan for your presentation time. All papers will be
presented in slots of just over 20 minutes, so you should plan on a talk of
15-17 minutes with 3-5 minutes for questions.

Finally, if your work involves an innovative system that would be
appropriate to demonstrate, we'd like to encourage you to submit a
demonstration to CSCW 2017 as well (deadline: November 4). Details at
https://cscw.acm.org/2017/submit/demos.php.

Again, congratulations! Thank you for submitting your work to CSCW 2017 and
we look forward to seeing you in Portland!

Louise Barkhuus
Marcos Borges
Wendy Kellogg
CSCW 2017 Papers Chairs

------------------------ Submission 516, Review 4 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users

Reviewer: AC

Expertise

    2 (Passing Knowledge)

First Round Overall Recommendation

    3 (Maybe acceptable (with significant modifications))

Contribution and Criteria for Evaluation

    The paper presents the design and evaluation of a gamified tool for
    socializing and retaining new Wikipedia editors. Contribution criteria
    include (1) a description and rationale for the system; (2) system
    novelty and rationale for how it leads to learning; and (3) a
    methodologically sound evaluation.

First Round Review from AC (if needed)


Coordinator's First-Round Report to Authors

    The paper presents the design and evaluation of a gamified tool for
    socializing and retaining new Wikipedia editors. The study found that
    users liked—but did not learn from—the system.

    The focus on improving the experience of newcomers in Wikipedia is
    relevant and important. Reviewers describe the study as well motivated
    and exceptionally well-written. Read R3’s comments on the writing
    quality and congratulate yourself!

    The reviewers, however, have many concerns about the paper—each
    focusing on a different aspect of the work. The concerns the reviewers
    note /may/ be addressable during the revise and resubmit period, but it
    will be an exceptionally herculean effort. Also, please keep in mind
    that there is no guarantee of acceptance even after making changes. It
    is therefore at the authors’ discretion whether to proceed with
    revisions or withdraw the paper.

    There is a split amongst the reviewers as to whether the failure of the
    tool is interesting or not. R1 raises concerns that the failure of the
    tool could be predicted from existing literature, suggesting little
    rationale for doing the work in the first place. R2 asks whether there
    is something fundamentally different about people who continue to
    contribute to Wikipedia, and as such whether the system holds value in
    practice. R3, on the other hand, sees much value in the work's systems
    contribution as well as the real-world evaluation. R3's review has some
    suggestions of alternative framings that may make the contribution more
    valuable.

    In its treatment of related work, many improvements are needed. R1
    notes that a discussion of the well-known concept of
    legitimacy/authenticity in learning environments is missing. R2 also
    points to missing literature about the Wikipedian experience.
    R2 and R3 raise a number of methodological questions about the paper.
    R2 suggests the distribution of participants across the timeline may
    bias the results. R3, on the other hand, sees opportunity here,
    suggesting additional statistical analysis related to longevity and
    power users. Both R2 and R3 question the methodological choice and
    contribution of measuring perceptions of learning rather than actual
    learning. Overall, this points to a need for, at the very least,
    justifying the methodological choices and, at most, carrying out
    additional statistical analyses.

    In summary, there is quite a bit of work to be done. I wish the authors
    the best of luck, should they choose to continue in the review process.


Requested Revisions

    REQUIRED:
    - Provide justification for why the study was worth carrying out, in
      response to R1 and R2’s concerns. R3’s review may have some insight
      into alternative framings.
    - State the research questions more explicitly, as per R2’s
      recommendation.
    - Address R1 and R2’s concerns about missing literature.
    - Ensure that the narrative around Wikipedia is clear to readers who do
      not have an in-depth background in production/editing details.
    - Improve the clarity of the results by using percentages or another
      baseline that allows comparison between numbers, as per R2’s review.
    - Provide justification for measuring perceptions of learning versus
      actual learning.
    - Provide a robust discussion of why the results are meaningful for
      researchers and/or practitioners.

    OPTIONAL, RECOMMENDED:
    - Consider carrying out additional statistical analyses as recommended
      by R3.
    - Provide a short justification for use of English-language Wikipedia,
      as per R2’s review.

Formatting and Reference Issues


Author Response

    Most or all of my comments were addressed.

Final Rating of Revision

    4 (Probably Accept)

The Review of Revision

    The authors addressed the majority of reviewer concerns during the
    revision period. The explanation of study purpose and specific research
    questions is much clearer. The authors filled the gaps in the related
    work. Wikipedia-specific jargon was reduced. The data were reanalyzed
    to address reviewer concerns about temporal aspects of the data. Where
    the paper still struggles a bit is in clearly articulating its
    contribution.

    As such, the reviewers are largely positive, but also have significant
    reservations about the work. In what is otherwise a solid research
    study, the authors have struggled with clearly articulating how the
    work contributes to - and challenges - existing research knowledge.
    Ultimately, I think this is an issue of the paper just needing a bit
    more tweaking to the narrative. The reviewers all note ways in which
    the paper does provide a counterpoint to existing research.

    In particular, R1 notes that the findings of the paper are different
    from what one would expect given Halfaker's work on newcomer
    enculturation. This previous work indicates that interventions to boost
    confidence should have had a positive effect. But in this study, that
    didn't happen. So although the authors may not have called out this
    issue as explicitly as they could, it does appear that the work has
    some findings that contrast with research about Wikipedia newcomer
    enculturation. In addition, there are (perhaps understated)
    implications for gamification research, which R3 describes as
    "fascinating."
    To sum up, the paper is right on the borderline for acceptance at a
    top-tier conference, and I would lean toward accepting it. It's not a
    perfect paper, but I am very encouraged by the amount of thought and
    conversation this work has raised amongst the reviewers. I'm also
    encouraged by how the work has cross-cutting implications across three
    research areas: learning, online peer production, and gamification. So
    there is potential for rather broad appeal.

Coordinator's Final Report to Authors (meta-review)

    Congratulations on acceptance! The paper was discussed at the program
    committee meeting and was positively received. While no changes are
    mandatory, we do recommend taking a look at the second round reviews,
    which have some additional suggestions for improvement.

Remaining Formatting and Reference Issues


Report completed

    Completed


------------------------ Submission 516, Review 1 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users


Expertise

    4 (Expert)

First Round Overall Recommendation

    3 (Maybe acceptable (with significant modifications))

Contribution and Criteria for Evaluation

    In this paper, the authors present the design and two-pronged
    evaluation of a tutorial for new Wikipedia editors that uses elements
    of gamification like missions and badges to help coach new editors and
    help them learn best practices and social norms of Wikipedia. The
    outcome is that users like the system but, based on behavioral
    measures, they don't actually learn from it. Learning interventions are
    a classic kind of research problem, and the paper should include robust
    measures of learning, as well as a good description of the designed
    intervention itself, why the design is expected to lead to learning,
    and a clear description of the study.

First Round Review

    This is a reasonably well-motivated study with connections to
    appropriate literature, and the writing is engaging and understandable.
    The problem of enculturating newcomers into projects like Wikipedia is
    well documented, and this paper investigates a potential intervention
    with an admirably well-planned study. Designing learning interventions
    is really difficult, and I commend the authors on a well-executed
    effort.

    Still, I am ambivalent about the paper because I would have predicted
    these outcomes based on the literature alone. In the discussion, the
    authors note that one mismatch between Wikipedia and the tutorial as
    designed involves the “gradual peripheral participation” of newcomers
    as they take on the identity of “Wikipedian.” They suggest that maybe
    speeding up this process is unnatural. I would argue that the most
    important concept from the literature on learning is missing from this
    discussion, and that’s “legitimacy” (also sometimes referred to in the
    education and learning literature as “authenticity”). The authors
    explain that by doing tasks in a pretend version of Wikipedia, they
    make it a safe space for newcomers to practice, yet performing “canned”
    tasks in a pretend system is the opposite of offering a legitimate form
    of participation. I immediately wonder: why not use what we know from
    the literature to create low-risk missions that newcomers can complete
    while legitimately contributing to the encyclopedia?
    Risk taking is a fundamental characteristic of games that makes them
    engaging; it certainly seems like it would play a role in people’s
    motivation in a scenario like this. Rather than eliminating risk, the
    literature on legitimate peripheral participation would suggest that
    finding the right degree of risk is required to facilitate progressive
    entree into a set of shared practices.

    I am disappointed by the missed opportunity here: the outcome mainly
    seems to verify that what we know shouldn’t work based on the
    literature in fact doesn’t work. Still, the paper isn’t bad, and the
    study is carefully crafted and reported.

    With some extension and reflection, I think the discussion could help
    point future research in a more fruitful direction. There are millions
    of pages written on the challenges of designing learning interventions
    that change people’s behavior, yet this paper ends on a painfully
    obvious note. It’s true that usability isn’t all it takes, but what can
    we learn from TWA about the design of systems to facilitate
    enculturation into a community of practice? What can we take away from
    this that might inform more successful tutorial systems in the future?

Author Response

    Most or all of my comments were addressed.

Final Rating of Revision

    4 (Probably Accept)

The Review of Revision

    The revised version of the paper has addressed many of the issues
    raised by me and other reviewers and, in some cases, even presents new
    analyses to address critiques. My initial response to the study was
    that, knowing all that I do about the literature on learning, I don't
    see why an intervention like TWA should be expected to work. The
    authors have addressed that problem by including citations to Guzdial
    and Tew that suggest inauthentic learning experiences can be effective
    sometimes. It's fine to point out exceptional cases - it doesn't change
    the fact that generally the literature would lead us to expect such
    interventions not to work. And it didn't work. The fact that the
    authors thought it would work and can rationalize it post hoc isn't an
    argument that the results are surprising. What might have been more
    compelling was the observation that confidence was bolstered and
    therefore should lead noobs to overcome the problems outlined in
    Halfaker's "don't bite the newbies" paper - yet it didn't. The findings
    seem to suggest that Wikipedians are, indeed, born and not made. Either
    that or this simply isn't a good way of enculturating newcomers. Maybe
    people didn't find the tasks to be authentic in the context of an
    imagined community? Maybe people still think Wikipedians are mean and
    scary even if their confidence has been bolstered? Maybe gamification
    isn't a great way to engage would-be encyclopedia writers.

    I have raised my score to a 4 because the authors have done a good job
    of writing about a negative result, but I clearly also have some
    serious reservations. The findings suggest that Wikipedia needs to work
    on welcoming people, but we already knew that from the "don't bite the
    newbies" paper. In the end, I feel that although the paper presents a
    negative finding well, the many alternative explanations don't provide
    a satisfying narrative that leaves a reader with answers or theoretical
    insight. The phenomenon that learners report they *like* or enjoy a
    learning intervention that has no impact on their behavior or learning
    is, unfortunately, all too familiar.
    (It's almost remarkable that the intervention didn't even elicit a
    short-term Hawthorne effect.)

Remaining Formatting and Reference Issues



------------------------ Submission 516, Review 2 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial
for New Users


Expertise

    4 (Expert)

First Round Overall Recommendation

    2 (Probably NOT acceptable)

Contribution and Criteria for Evaluation

    This paper's contribution is the design and evaluation of a structured
    introduction to a peer production community (English Wikipedia) called
    "The Wikipedia Adventure". TWA's design is rooted in theories of
    gamification, and its utility is evaluated through a user survey and an
    invitation-based field experiment. The paper reports on the survey
    respondents' satisfaction with TWA, and how their experiment results
    reveal some of the challenges of effecting lasting changes to
    contributor patterns in peer production communities. These findings are
    then discussed in relation to cultural factors in Wikipedia, issues of
    self-selection and voluntary participation, and the limitations of
    gamification.

    When evaluating a paper that describes the design of a system, the two
    main criteria are that the system and/or its development setting is/are
    novel, and that the way the system is evaluated is methodologically
    sound.

First Round Review

    As mentioned in the contribution section, this paper's contribution is
    the design and evaluation of a structured introduction to a peer
    production community based on gamification, called "The Wikipedia
    Adventure". This is a great idea and sounds like a useful addition to
    Wikipedia. The paper is written in a way that makes it easy to read,
    and provides the reader with a good introduction to how TWA's design is
    rooted in theories of gamification, thus applying these principles in
    what appears to be a novel setting. The paper also does a good job of
    discussing the findings, organizing them in a way that is easy to
    follow and touching on important points (e.g. cultural factors, and the
    limitations of self-selection and gamification).

    The overall ideas and approach taken in this paper are sound and in
    line with the criteria described previously. Unfortunately, there are
    two major issues and several minor ones that need to be resolved before
    this paper is ready for publication. The first major issue is that the
    methodology used to evaluate performance in the invitation-based
    experiment measures contribution in a skewed manner and does not
    establish why that is appropriate. Secondly, the paper fails to
    consider arguments put forth by Panciera et al.'s "Wikipedians are
    Born, Not Made" paper. This review will expand on both of these major
    issues below. Further below are notes and comments with suggestions for
    improvement for specific sections of the paper, some of which are
    rather substantial as well.

    1: Evaluating TWA effectiveness by number of contributions
    ----------------------------------------------------------

    A major part of the paper is the evaluation of TWA's effect on
    subsequent contributions. To evaluate this an invitation-based field
    experiment is used, and the paper does a great job of justifying why
    that is appropriate in this setting. The experiment runs from February
    2014 for three months. Exact dates are not given, so let us assume that
    it ran until the end of April 2014.
    User contributions are then measured until the end of May 2014.

    There are two problems with this approach that the paper fails to
    address properly. One is the issue of right-truncation found in the
    data. Contributors who joined in early February 2014 would have about
    four months to make edits, whereas those who joined in late April would
    only have about a month. The model does contain a control variable for
    number of days in the experiment, but why is that appropriate in this
    context? If we examine other work in the same domain, they tend to
    either use a much longer time period (e.g. the Teahouse paper, citation
    23, which uses 6-9 months) or ensure that the time period is fixed
    (e.g. Kittur et al. "Herding the Cats: The Influence of Groups in
    Coordinating Peer Production", WikiSym 2009; or Zhu et al.
    "Effectiveness of Shared Leadership in Online Communities", CSCW 2012).

    Related to the right-truncation problem, the paper also fails to
    discuss and justify what a reasonable timespan for measuring the effect
    of TWA is, and how that choice affects the number of contributions
    observed. It might for instance be that TWA instead has an effect on
    how long it takes before a user drops out of the system. If we assume
    that TWA has an effect on contributions, what timespan is needed to
    measure that effect? The paper assumes that a month is adequate to
    discover it, whereas one might suspect that it is only measurable over
    a longer period of time. If it is the case that a short period of time
    is appropriate (for instance because these users are likely to drop out
    after a certain amount of time) the paper needs to properly establish
    that, either by measuring it or referring to previous work.

    2: Wikipedians Are Born, Not Made
    ---------------------------------

    In their GROUP 2009 paper "Wikipedians Are Born, Not Made: A Study of
    Power Editors on Wikipedia", Panciera et al. show data arguing that
    those contributors who are going to stick around behave in a way that
    is different from the very beginning. In follow-up work published in
    2010 they find similar differences in another peer production
    community. (Panciera et al. "Lurking? cyclopaths?: a quantitative
    lifecycle analysis of user behavior in a geowiki." CHI 2010)

    These two papers and the argument they put forth are relevant because
    they question who TWA is designed for. In the related work a reference
    to Bryant et al.'s "Becoming Wikipedian" is made, thereby suggesting
    that TWA is designed to teach someone how to be a Wikipedian. Since
    Panciera et al.'s paper argues that these contributors are already
    Wikipedians, should TWA instead be designed to help them stay
    productive?

    If Wikipedians are born, not made, then one could also question whether
    these contributors are going to use TWA at all. Maybe they ignore TWA
    because they are already productive and do not need it? Since the paper
    never references these papers or discusses the issues they raise (e.g.
    "is the Teahouse more effective since it allows them to get answers
    when they need help?"), this whole topic area is left hanging.

    ---
    Below follow comments/notes for each section of the paper.

    Introduction:
    * An overall issue here is that there are few citations to sources. For
    instance a claim is made that "newly created accounts are the primary
    source of spam and vandalism on Wikipedia".
    Consider adding a "[citation needed]" after that.
    * When citing multiple papers it is preferable that they are in order,
    e.g. "[14, 23, 17]" should be "[14, 17, 23]" (page 1). This minor issue
    also occurs elsewhere in the paper.
    * "Unlike prior systems, TWA creates a structured experience that
    guides newcomers through critical pieces of Wikipedia knowledge..." Do
    we know that there are no other prior systems that offer a similar
    experience? It might be that there are none within the Wikipedia
    domain, but what about outside it? That sentence is making a rather
    bold claim.
    * After reading the introduction, what is the reader expected to
    remember as the main findings in this paper? At the end of the
    introduction the following sentence is found: "The study underscores
    the importance of conducting multiple types of evaluations of social
    systems." Is that the main contribution? What about the implications
    for gamified structured introductions to peer production?

    Background:
    * "...women reported that they found that contributing to Wikipedia
    involved a high level of conflict and that they lacked confidence in
    their expertise [8]. This suggests that more effective onboarding tools
    could help incorporate newcomers." This is an important side of
    Wikipedia, but how does TWA's design help mitigate this issue? Are
    there design elements in TWA that aim to boost confidence in one's
    expertise?
    * At the end of the introduction we find the following two questions:
    "Would a gamified tutorial produce a positive, enjoyable educational
    experience for new Wikipedians? Would playing the tutorial impact
    newcomer participation patterns?" These are the paper's _research
    questions_! It would be very helpful to the reader if they were
    displayed more clearly, e.g. as separate items. They should not be
    hidden.

    System Design:
    * "...it does not depend on the availability, helpfulness, or
    intervention of existing Wikipedia editors..." The underlying argument
    here is that scalability is preferable to personal interaction when
    socializing newcomers (in peer production communities). Why is that the
    better solution? As discussed previously, TWA might be designed for
    contributors who are not going to stick around; why are those the right
    audience for it? Is the goal to provide _everyone_ with a scalable,
    impersonal introduction, or is it better to provide _some_ (typically
    based on self-selection) with a personal introduction (e.g. the
    Teahouse)?

    Game-like elements (subsection of System Design):
    * In "Missions" a distinction is made between "basic" and "advanced"
    editing techniques. It appears somewhat arbitrary: why is adding
    sources advanced editing, but watchlists are not?
    * Your readers might not know what watchlists are; take care to write
    for a general audience, as not everyone knows a lot about how Wikipedia
    works behind the scenes.

    Study 1: User Survey:
    * This paper doesn't discuss any other language editions of Wikipedia
    besides the English one, and makes the assumption that "Wikipedia"
    equals the English edition. Adding a mention that Wikipedia exists in
    multiple languages and explaining why English was chosen as the
    language where TWA was launched would be very helpful.
    * The paper aims to measure "educational effectiveness". Why is a
    survey the appropriate way to measure that?
    Based on the description of the survey, it seems that it never asks
    specific questions to test whether TWA's users learned specific things,
    in other words whether the education was successful. Later when
    describing the results the phrase "learning to edit Wikipedia" is used;
    isn't that the _key_ learning goal of TWA? Yet the survey asks
    Likert-scale questions. In other words, you're measuring whether TWA
    users are under the impression that they learned something, not whether
    they actually did.
    * Figure 4 uses counts. While it shows that none of the questions had
    responses from all participants, it makes comparisons between questions
    with different response rates very difficult. Using percentages would
    allow for direct comparisons, and would make the references to the
    figure in the text easier to follow along with. The text refers to four
    questions with a certain percentage of responses, but leaves the math
    to the reader.
    * The survey leaves many questions unanswered, some of which the paper
    might want to address. Were any negative questions asked? Were there
    any control questions, such as a similar question worded slightly
    differently to allow for comparison between responses? As it is, this
    survey comes across as a set of positive statements about TWA that
    respondents agreed to. Given that respondents self-select and no
    attempts to contact users who didn't go through TWA appear to have been
    made, it is likely there is a bias in the responses, and that bias
    should be discussed.

    Study 2: Field Experiment:
    * The description of how accounts were selected to be included is
    rather confusing. First it describes 1,967 accounts that met the same
    criteria as for the user survey, however 10,000 individuals
    ("accounts"?) were invited to the beta. Why is one an order of
    magnitude larger than the other? Then in the second paragraph of
    "Methods" it describes the selection criteria, that at least one
    contribution would have to be made after getting invited. This would
    perhaps be much less confusing if the criteria were first explained,
    particularly how the experiment and control groups were set up, and
    then how many accounts were identified.
    * "This is a larger proportion of users than took up the invitation in
    Study 1, which may be due to changes in the invitation text." Earlier
    in the paper study 1 refers to a "beta", whereas this appears not to be
    one. If this is the case, this is an important difference between the
    two that should be made clear to the reader.
    * "we measure the overall contributions as the total number of edits
    made by each account from the time of inclusion in the study until May
    31, 2014." When exactly is the "time of inclusion"? Is that when they
    got the invite? What about when they completed one (or all) TWA
    mission(s)? The concern here is that all contributions are measured,
    whereas the experiment sets up a pre/post scenario. Later on the paper
    refers to "subsequent contributions", indicating that contributions
    after a certain point in time were measured. This quickly becomes
    rather confusing; spelling out clearly which points in a user's account
    history are used (e.g. "we measure contributions at four points in
    time: when the user registered their account, the time of invitation,
    when they first started using TWA, and the end of the experiment")
    would be very helpful.
    * Why is a six-edit radius chosen when measuring word persistence?
    Halfaker et al.
    make no claim about what the radius should be in the referenced work,
    and Ekstrand et al. suggest a 15-edit radius in a related paper
    (Ekstrand and Riedl "rv you're dumb: identifying discarded work in Wiki
    article history." WikiSym 2009). The six-edit radius also comes with an
    issue that is unaddressed: how long does it take for an edit made by a
    contributor in the study to reach that six-edit radius? If it hasn't
    been reached at the end of the study period, that edit has to be
    discarded as its quality is unknown. In a related paper, Farzan and
    Kraut instead chose to use the percentage of words that survived as a
    measure of quality (Farzan and Kraut "Wikipedia classroom experiment:
    bidirectional benefits of students' engagement in online production
    communities" CHI 2013).
    * Tables 1, 2, 3, and 4, as well as figure 6, should be brought closer
    together so it's easier to follow along. Table 1 occurs before the text
    that refers to it, and table 4 is two pages further along. Putting all
    tables and figure 6 on the same page might be a good solution.
    * Table 3 refers to users who "reached" a mission. It is confusing how
    181 users reached the final mission but did not complete it, yet in the
    text it seems these 181 users actually did.
    * The post-hoc power analysis is very useful!

    Discussion:
    * "The new editors in our study may have had unpleasant experiences
    during their initial time on Wikipedia..." It appears that the survey
    asked no questions about this, yet is it not a very important issue
    related to TWA's success?
    * In "Limitations of gamification" the following sentence is found:
    "...our study is among the first that compares levels of participation
    in a task among individuals who were introduced to gamified learning
    first to those that were not." This is an _important_ finding; it
    shouldn't be hidden back here but instead be up front in the
    introduction!

Author Response

    Most or all of my comments were addressed.

Final Rating of Revision

    5 (Definitely Accept)

The Review of Revision

    First of all, this reviewer would like to congratulate the authors on
    the herculean effort that's been put into improving this paper and the
    quality that has resulted from it. The attached revision document is
    also of high quality, carefully considering the comments from the
    reviewers and arguing well for why some of our suggestions were not
    implemented.

    After carefully reading the revised version, my final recommendation
    for this paper changes to a 'Definitely Accept'. There are several
    reasons why this paper ought to be included in the conference
    proceedings:

    1: It is a well-written paper. This was implied in some of my previous
    comments, as well as R3's applause. The revised version is no different
    from the initial one in this regard: the changes kept with the clear
    writing style, and the content changes have further improved the paper.

    2: The literature on newcomer interventions in peer production
    communities (such as Wikipedia) is sparse. Much work has studied what
    happens to newcomers in these communities and proposed solutions, but
    large-scale interventions are few and far between. This paper therefore
    starts filling that gap in the literature.

    3: The design of TWA is well-founded.
As the paper argues:

    "The design of our system was informed by previous empirical, theoretical, and systems work and our system performed well according to the types of survey self-report measures used to evaluate the usability of many social computing systems." (revised version, page 12)

    There are strong reasons to believe that this intervention _should_ work, partly due to the positive responses in the survey, as well as previous research on gamification. In other words, the fact that it didn't work is arguably a noteworthy result in and of itself.

    4: Figuring out why these types of interventions fail and/or what types of interventions succeed is probably on the order of a lifetime's worth of research work. As the authors argue in the revision notes, documenting these failures is important. We don't need to document _all_ of them, but given that this one describes an intervention with a reasonably solid foundation in previous work that indicated it was likely to succeed, it should be a sufficiently interesting example that the community will benefit from having documented.

    5: Connected with the last sentence in point #4 (it's a sufficiently interesting example), this paper's negative result can initiate a discussion around, and motivate future research in, this space in order to uncover what factors lead to a successful intervention, as well as to further document failures.

    Those things being said, there's still a bit of room for improvement in this paper; here are some final notes and suggestions:

    Introduction:
    In the introduction, the phrase "Social computing systems that aggregate voluntary contributions" is used, while in the conclusion the phrase "peer production" is used. Consistent terminology usage is useful.

    Awkward phrasing: "(a limited resources)"

    Awkward phrasing: "...how new users perceive to the system's design and tone."

    Background:
    In the section "Why Gamify Becoming a Wikipedian?" a reference to Kriplean et al.'s barnstar work is made: "...badge-like social awards which confer external recognition of their achievements [31]." Something that wasn't mentioned in the previous review is that Kriplean et al. found that barnstars take a rather long time to be awarded (see their footnote 2, page 3): the median number of edits is around 1,200 and the median tenure is around a year. This suggests that barnstars are a rather slow process, creating somewhat of a broken feedback loop. TWA's faster achievements, and perhaps other solutions such as "WikiLove" and "thanks", can be seen as important improvements since they close the loop much faster. The lack of (positive) feedback on wikiwork is maybe one of the reasons newcomers don't stick around. (While that's not necessarily a suggestion for changes to this paper, it's perhaps something worth keeping in mind for future work.)

    In the same section, a reference to GettingStarted is made. This reference points out some of GS's features, which seem to be very similar to some of what TWA does. The claims in the introduction that are made about TWA's novelty in creating a structured experience for newcomers are therefore maybe a bit strong?

    System Design: The Wikipedia Adventure:
    In the section "Game-like elements", subsection "Missions", a reference to setting up a user page is made. It might be useful to explain to the reader why creating a user page is important (e.g.
that non-existent user/user talk pages are a flag to patrollers, or that it can signal stronger commitment to the community).

    Study 1: User Survey:
    The study and the results in Figure 5 are referred to in the text as measuring "user confidence" and "user engagement". The two rightmost questions don't fit into that; they instead measure the participants' perception of whether TWA would be useful for other newcomers in the community. In addition to these questions not being referenced in the text, an issue with this evaluation is that it asks only one specific group to evaluate the system. There's no survey of experienced contributors, for instance whether they perceive TWA participants to be "better" contributors. Lastly, there is also the issue of whether a newcomer to the Wikipedia community is able to properly assess whether TWA is a good way to introduce newcomers, partly because they might not know what's missing.

    In the results section the following claim is made: "These findings provide validation of our choice to gamify the tutorial." That conclusion doesn't appear to be supported. The survey questions don't appear to poke at whether the gamification elements of the tutorial were the reason for the positive responses. It instead appears that we have a general evaluation of the perceived utility of TWA.

    Study 2: Field Experiment
    Table 2 in the results is still somewhat confusing. Using "Attrition" doesn't seem to work well either, since there appear to be 181 participants who either started or completed mission seven. Maybe it's just that there's no mission eight, so the categorization scheme becomes difficult? Either way, some way of clarifying what happened to these 181 participants would be helpful.

    Figure 6 might benefit greatly from the Y axis being log-scale for the edit counts, given that the distributions are so skewed. Not sure there's much usefulness in a box plot if there's no box.

    Discussion:
    "The null findings in these models indicate that the people who played the game and went on to contribute extensively would have done so anyway."
    So are Wikipedians born, not made, then? :-P

Remaining Formatting and Reference Issues

    Looks like the 15th page can be spared if the references are shortened, for instance by removing "ACM, New York, NY, USA" from all the ACM references, shortening the proceedings names, etc. Given that there isn't strictly a page limit, it's arguably not that necessary, but perhaps worth considering.


------------------------ Submission 516, Review 3 ------------------------

Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial for New Users

Reviewer: AC-Reviewer

Expertise

    4 (Expert)

First Round Overall Recommendation

    3 (Maybe acceptable (with significant modifications))

Contribution and Criteria for Evaluation

    This paper presents the results of a deployment of a gamification-based system designed to retain new editors in Wikipedia. It is a negative results paper: the authors claim that they have conclusive evidence that the system did not work (although I have suggested a few additional lines of inquiry below that might problematize this assertion).

    The committee will have to have a discussion about how to evaluate this paper, and likely negative results papers more generally.

First Round Review

    This paper presents the results of a deployment of a gamification-based system designed to retain new editors in Wikipedia. It is a negative results paper: the authors claim that they have conclusive evidence that the system did not work (although I have suggested a few additional lines of inquiry below that might problematize this assertion).

    The paper is very well-written and has some large positives. It also is a negative results paper, and the committee will have to decide how to handle this. In general, I'm strongly sympathetic to arguments to include more negative results papers in our proceedings, but I'm quite unclear on the details of how to do so (e.g. what defines a top-quality negative results paper?). I'm hopeful that this paper can instigate a broader discussion on this topic at the PC meeting.

    All of that said, this paper also has a number of idiosyncratic limitations that make it perhaps not the best trial balloon for negative results papers. Below, I outline what I believe to be the paper's positives and then describe these limitations in more detail, phrased as both critiques and questions.

    Overall, my recommendation is to invite the authors to revise and resubmit. If this occurs, I'll want to see the below critiques addressed and the below questions answered (both through direct answers in the response to reviewers and through clarifications and changes to the paper). I'm hopeful that, through the R&R process, this paper can become an ideal negative results trial balloon.

    Important positives:

    * The authors built a system to solve a real-life problem and did a real-life, relatively large-scale deployment. Awesome!
    * The paper is easily in the top 5% in terms of writing quality. This is true both at the sentence level and at the narrative level. As a person who has to review lots of papers, this was a breath of fresh air.
    * The design of the game is quite well-thought-out, save a few relatively arbitrary decisions. I was particularly compelled by the use of gamification techniques that are also present in "real Wikipedia" (e.g. barnstar-like rewards).

    Critiques:

    CRITIQUE #1 - Excessive import placed on trivial self-report data: It is well-known that self-report data from participants is inferior to observations of actual behavior, and that self-report data can be quite unreliable more generally. As such, in my view, it is not a contribution to show that self-report data didn't end up panning out in the behavioral results.

    In the next draft of this paper, I would like to see the authors address this issue. This might mean framing this paper as a full-on negative results paper, but lighter-weight adaptations might be possible.

    Open questions:

    QUESTION #1: As noted above, this paper is a negative results paper at its core, and we'll have to have a broad discussion about this at the PC meeting, assuming the paper makes it this far. In the event that this occurs, can the authors provide a more robust argument as to why these negative results are important for other researchers and practitioners?

    The paper attempts to argue that one contribution that comes out of its negative results is to distrust self-report data, but this is well-known (see below). The other negative results argument in the paper is that these results add to growing evidence of long-term gamification failures.
I find this argument much more compelling. In other words, by expanding on this argument, the authors may be able to address this question.

    That said, regardless of how this question is addressed in the second draft, I'd like to see it done both through changes to the paper and through discussion in the response to reviewers.

    QUESTION #2 - Is there a possibility that the statistical framework employed is not appropriate for this particular study?

    The authors utilize a two-level statistical approach that I haven't seen before in the CSCW/CHI literature. I enjoyed thinking about this approach, and the authors did a relatively good job explaining it. That said, I'm currently not convinced that it was the appropriate framework for this study. Here's my reasoning:

    (1) The goal here is to introduce a treatment that ultimately will produce strong new members of the Wikipedia community at a higher rate than the control.
    (2) Let's say the game produces 3 such members out of 100 new editors and the control produces 1, which looks like it might be the case. Let's also say that this pattern additionally persists over a large n.
    (3) If this is true, why do we care about the potentially moderating effect of the invitations?

    The authors argue that new editors who responded to the invitation to play the game might just be new editors who are engaged and, critically, would have been power editors whether or not the game existed. However, barring a random fluke, shouldn't these future power editors also have been in the control group? If I'm right here, I'm thinking the invitation doesn't matter and a more traditional statistical analysis (or at least one targeted at identifying rare events) is appropriate.

    I could be wrong, but I want the authors to respond to this question, both through feedback to reviewers and clarifications in the paper.

    As an important side note, if we agree that this framework is the right way to go in the end, the authors should puff their chests more about this by claiming it as a contribution (assuming it hasn't been used at CSCW before).

    QUESTION #3 - Are the outcome variables considered here the best outcome variables? Are some critical variables missing?

    The authors seem focused on the average effects across the entire control and treatment groups (the two treatment groups, to be specific). However, would it not also be reasonable to consider the metric I describe above: the % of new editors that go on to be power editors? Since power editors end up contributing most of the edits anyway *over the long term*, to me this seems like the way to go (i.e. if this group of editors were followed for years, statistically significant differences would begin to emerge). If the authors agree, the authors need to reanalyze their data with this metric in mind.

    Another related outcome variable that might be useful to analyze is how long the new editors in each group remained active editors in the community (i.e. survival analysis). Because the data is quite old, this should be an easy new analysis to run, and longevity has been a variable of interest in a number of peer production studies.

    In their second draft and the feedback to reviewers, I would like to see the authors discuss either new analyses related to power users or why they did not consider this outcome variable. I would also like to see the same for survival analysis.
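    To make the survival analysis suggestion concrete, here is a minimal sketch of the kind of analysis I have in mind (the per-editor table and its column names are hypothetical placeholders, and the lifelines library is just one convenient option, not a prescription for the authors' pipeline):

        import pandas as pd
        from lifelines import KaplanMeierFitter
        from lifelines.statistics import logrank_test

        # One row per new editor. "days_active" is time from inclusion to
        # last observed edit; "churned" is 1 if the editor stopped editing
        # before the end of observation (0 = right-censored).
        df = pd.read_csv("editors.csv")
        treat = df[df["group"] == "treatment"]
        ctrl = df[df["group"] == "control"]

        kmf = KaplanMeierFitter()
        kmf.fit(treat["days_active"], event_observed=treat["churned"],
                label="invited to TWA")
        print(kmf.median_survival_time_)

        res = logrank_test(treat["days_active"], ctrl["days_active"],
                           event_observed_A=treat["churned"],
                           event_observed_B=ctrl["churned"])
        print(res.p_value)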

    QUESTION #4: Is there a path towards positive results?

    As noted above, I believe some discussion around this paper and negative results papers more generally will have to happen at the PC meeting. However, I think there are some missed opportunities here for positive results, and that the authors were too quick to settle for negative results. This is likely an important factor to consider when deciding whether to accept a negative results paper.

    Most notably, there are several well-motivated, unexplored avenues that could lead to positive results that would have a much larger impact than the negative results presented here:

    * As noted above, examining additional outcome variables is important, most notably the number of power editors and longevity.
    * Does the game work if folks are forced to play it prior to editing Wikipedia, as would be the case in most other institutionalized socialization contexts? This is not just a hypothetical: this game could be used in all Wikipedia Education Project classes and related endeavors.

Author Response

    Some of my comments were addressed.

Final Rating of Revision

    3 (Borderline)

The Review of Revision

    After reviewing the change log and the new draft, I remain on the fence about this paper. Below, I outline what I believe to be the key discussion points about this paper in preparation for a likely conversation with my fellow reviewers and at the PC meeting. First, though, I outline some important positives to keep in mind as we have this discussion:

    POSITIVES

    * This paper is a canonical systems paper, and one that has a strong evaluation. The effort involved in putting together this paper is probably 2-3x that of the average quant/qual paper.
    * I think the implications of these findings for gamification research are very interesting, especially because they replicate and extend what has been found in prior meta-analyses. (This receives too little attention in the paper, though.)
    * The paper is very well written.
    * The change log is by far the most detailed of any I have encountered thus far this year, although I think the authors were a little stuck in their ways in terms of actually making changes.

    Discussion point #1: What makes a good negative results paper? Does this paper meet these criteria?

    The revision of this paper doubles down on the "negative results as contribution" message. This means that we as a committee have to define the conditions for a high-quality negative results paper, and do so in a way that won't lead to moral hazards down the road. The authors did not do a good job in their change log of arguing why this paper is a good negative results paper, instead making standard arguments about the importance of negative results (without much recognition of the challenges associated with evaluating them). Most if not all reviewers should have already been aware of the argumentation in the change log.

    As far as I can tell, this paper implicitly and explicitly states that the following is required for a good negative results paper:

    (1) A sample size that gives us relative confidence that moderate growth of the experiment won't lead to important effects in the end (e.g. we might see significant results, but not ones of a meaningful size).
    (2) A discussion section that helps to interpret the negative results so that this paper can lead to some generalizable findings.
    (3) The usual array of well-executed methods, excellent communication of results, etc.

    Upon significant reflection, I tend to agree that these criteria do help to turn a negative result into something that can be useful outside of the specific experiment (a pre-condition to the acceptance of any paper, in my view). However, I think we need to reflect on this more as a community.

    Critically, we also need to decide if this paper meets the above criteria, which is the subject of the next two sections of this review. Overall, I think #1 and #3 are spot on with this paper, but #2 is weaker.

    METHODS

    I don't think the authors understood my concerns about their statistical approach. I do not take any issue with the two-level design. I take issue with the interpretation. The authors themselves argue that the invitation is an ecologically valid way to test the system, and to me, that means that - at least to some degree - the invitation is *part of the system*. After all, it would be necessary for its real-world deployment. This makes the first level of the results quite important, although I agree that the second level contributes to important understanding as well. The original paper was written as a social science paper would be and tried to control away the invitation. My point is that the invitation can in many ways be considered the first interaction with the system, a point with which I think the authors agree. This is one way in which this study differs from how this technique is often employed in the social sciences. (A small numerical sketch at the end of this review spells out the two readings.)

    The good/bad news is that it looks like, with the new results, the point is mostly moot: regardless of whether you consider the invitation as part of the system or not, there is no effect (and if there were an effect, it would be that the system made things worse). Note that this assumes I'm understanding things correctly: the authors use the terms 'control' and 'treatment' without specifying whether they refer to the first or second level.

    The only way in which this point is still an important one is that the interpretation is somewhat strained in this new draft with regard to this issue. I would confront it head-on in any future drafts. This would involve presenting two interpretations of the system: one that includes the invitation (which is ecologically valid in terms of how it would actually be deployed) [level one] and one that does not [level two].

    INTERPRETATIONS OF RESULTS

    I think the paper falls a bit short in interpreting what the results mean (criterion #2 for a good negative results paper, as per above). The key takeaway seems to be: designing gamified systems to support newcomers in Wikipedia is hard. I'm not sure that's good enough.

    The implications for the gamification literature continue to fascinate me (this is a big plus for the paper in my book), but the authors write in their change log that this is not the focus of the paper, and this is reflected in the new draft. My thinking is that if some of the most interesting implications of this paper are in this space, why not make it the focus of the paper? I realize that space is not the WP domain, but WP has a great deal of value as a test bed for social computing generally, not just for studying WP.
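    As promised in the METHODS section above, here is the level-one/level-two distinction spelled out numerically: with a randomized binary invitation and binary take-up, level one corresponds to the intent-to-treat (ITT) difference, and level two to the Wald/IV estimate that 2SLS reduces to in the no-covariate case. A minimal sketch on simulated data (all names and numbers are made up, not the authors' pipeline):

        import numpy as np

        rng = np.random.default_rng(0)
        n = 10_000
        invited = rng.integers(0, 2, n)            # randomized invitation
        played = invited * (rng.random(n) < 0.02)  # ~2% take-up, invited only
        edits = rng.poisson(3, n) + 5 * played     # toy outcome, true effect 5

        # Level one: the invitation treated as part of the system (ITT).
        itt = edits[invited == 1].mean() - edits[invited == 0].mean()

        # Level two: the effect of actually playing, using the random
        # invitation as an instrument. With one binary instrument and no
        # covariates, 2SLS reduces to the Wald estimator ITT / take-up.
        take_up = played[invited == 1].mean() - played[invited == 0].mean()
        print(itt, itt / take_up)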

Remaining Formatting and Reference Issues


Report completed
diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.pdf b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.pdf
new file mode 100644
index 0000000..57c9616
Binary files /dev/null and b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.pdf differ
diff --git a/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.tex b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.tex
new file mode 100644
index 0000000..91ed888
--- /dev/null
+++ b/cscw_changelogs/2017-the_wikipedia_adventure/twa-CSCW2017-revision_summary.tex
@@ -0,0 +1,1030 @@
\documentclass[12pt,letterpaper]{article}

\usepackage[T1]{fontenc}
\usepackage[utf8x]{inputenc}
\usepackage{graphicx}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage[breaklinks]{hyperref}

\hypersetup{colorlinks=true, linkcolor=Black, citecolor=Black, filecolor=Blue, urlcolor=Blue, unicode=true}

\usepackage[english]{babel}

\usepackage[font=footnotesize,labelfont=bf]{caption}
\usepackage[margin=0.8in]{geometry}
\usepackage{parskip}
\usepackage[round]{natbib}

\def\citepos#1{\citeauthor{#1}'s (\citeyear{#1})}
\def\citespos#1{\citeauthor{#1}' (\citeyear{#1})}

\def\todo{{\normalsize\color{BrickRed}{TODO }}}
\def\done{{\normalsize\color{SkyBlue}{DONE }}}

\begin{document}
\title{Revision Summary for ``The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial for Newcomers''}
\author{}
\date{}
\maketitle

\vspace{-1.5cm}

First and foremost, we thank the AC and the three reviewers for the time they have spent on our paper and for their detailed and very helpful feedback. As we hope you will see, we have taken this feedback seriously and spent a large amount of time and effort making an extensive revision of our manuscript to address the concerns that were raised.

We believe that the changes we have made address nearly all of the substantive concerns raised by the reviewers. We have also responded to changes that were suggested but that we did not make. In particular, we have made changes to address every issue called out by the AC, including all of the revisions listed as ``REQUIRED,'' as well as one of the two changes listed as ``OPTIONAL.''

This document attempts to provide a comprehensive description of our revisions. We believe that the revisions described here have greatly improved the manuscript and we appreciate the time and effort that our anonymous referees have spent on our work.

% \section*{AC's required revisions}

% \begin{itemize}
% \item \done Provide justification for why the study was worth carrying out, in response to R1 and R2's concerns. R3's review may have some insight into alternative framings. [Bringing people from peripheral to active. Born not made is not an argument against design interventions; it's an argument that specific system designs may enable certain participants from very early in their tenure. Study doesn't predict perfectly! Also, just because some people may become power users does not mean others cannot or that power users are permanently "that way" (they change over time in many ways). - AS. Justify the importance of intervening in general for the sake of newcomer socialization in peer production communities. - Sneha]

% \item \done State the research questions more explicitly, as per R2's recommendation

% \item \done Address R1 and R2's concerns about missing literature

% \item \done Ensure that the narrative around Wikipedia is clear to readers who do not have an in-depth background in production/editing details.

% \item \done Improve the clarity of the results by using percentages or another baseline that allows comparison between numbers, as per R2's review.

% \item \done Provide justification for measuring perceptions of learning versus actual learning. [JMo: system goal - interested in retention/enjoyment, not testing learning! Engagement is the key outcome here. So we're evaluating that.][ads: Also add citation to the legitimacy/authenticity thing to discussion as a potential explanation of null effect]

% \item \done Provide a robust discussion of why the results are meaningful for researchers and/or practitioners.

% \item \done (Optional) Consider carrying out additional statistical analyses as recommended by R3. [Note (AS \& SN): R3 raises concerns that reflect a misunderstanding of both (1) the invitation-based design (and the role of ITT and 2SLS models) and (2) the fact that our analysis already addresses the potential for skewed/rare outcomes by using negative binomial models and conducting robust, non-parametric tests like Mann-Whitney. We can address R3's concerns mostly by being clearer and more forceful about the justification for our analytic approach.]

% \item \done (Optional) Provide a short justification for use of English language Wikipedia, as per R2's review.
% \end{itemize}

\section{Time period for experimental data collection}

R2 and the AC raised two closely related concerns about the time period over which users were observed during the experiment. R2 described the issue clearly:

\begin{quote}
Contributors who joined in early February 2014 would have about four months to make edits, whereas those who joined in late April would only have about a month. The model does contain a control variable for number of days in the experiment, but why is that appropriate in this context? If we examine other work in the same domain, they tend to either use much longer time period (e.g. the Teahouse paper, citation 23, which uses 6-9 months) or ensure that the time period is fixed (e.g. Kittur et al. "Herding the Cats: The Influence of Groups in Coordinating Peer Production", WikiSym 2009; or Zhu et al. "Effectiveness of Shared Leadership in Online Communities", CSCW 2012).
\end{quote}

% To address these concerns, we have collected new data, produced new results, and made several other changes to address these concerns in our manuscript.

\subsection{Ensuring consistent right-truncation}
"Herding the Cats: The Influence of Groups in Coordinating Peer Production", WikiSym 2009; or Zhu et al. "Effectiveness of Shared Leadership in Online Communities", CSCW 2012). + +To address the issue of right truncation, we followed the approach of using a fixed period as used by \citet{kittur_herding_2009} and \citet{zhu_effectiveness_2012} and suggested by R2. We used a fixed timespan so that users are followed for 180 days after being added to the study. + +Doing so required collecting and constructing a new dataset from Wikipedia and completing a new set of analyses. Because it is no longer necessary, we dropped our control variable for number of days in the experiment from our regression models. Our results are unchanged. + +\subsection{Justifying study period length} + +% \item \done {\bf Justify timespan of the experimental study} R2: Related to the right-truncation problem is the fact that the paper also fails to discuss and justify what a reasonable timespan for measuring the effect of TWA is, and that it will have an effect on the number of contributions made. It might for instance be that TWA instead has an effect on how long it takes before a user drops out of the system. If we assume that TWA has an effect on contributions, what timespan is needed to measure that effect? The paper assumes that a month is adequate to discover it, whereas one might suspect that it is only measurable over a longer period of time. If it is the case that a short period of time is appropriate (for instance because these users are likely to drop out after a certain amount of time) the paper needs to properly establish that, either by measuring it or referring to previous work. + +R2 encouraged us to, ``discuss and justify what a reasonable timespan for measuring the effect of TWA is.'' After consulting other experimental and design work in Wikipedia, including those suggested by R2, we opted for a 180 days time-period and added text to our manuscript to justify this choice. + +We have added a paragraph to the end of our Methods section that explains how we constructed the window and that addresses the trade-offs associated with choosing an appropriate data collection period. We explain that although a longer data collection period provides more time to observe systematic variance between the treatment and control groups, it introduces concerns that differences too long after the intervention may not be justifiably attributed to the intervention itself. Our 180 day window is as long as any previous field experiment or system deployment in Wikipedia that we have found. For example, \cite{morgan_tea_2013} surveyed users at between 1 and 6 months and \citet{zhu_effectiveness_2012} follows users for 3 weeks including the week of the intervention. + +At the end of our results section, we briefly summarize our result from robustness checks using alternate periods of 360 and 60 days. Our results for 360 days are similar to the results for 180 days. Our results for 60 days show a small negative effect on total number of edits made during the study period. We explain in the paper that one potential explanation for the negative result is that participation in the tutorial may have supplanted other editing among participants in the treatment group but that this effect is ``washed out'' over time. Because we felt \emph{ex ante} that the 180 window was more conservative in that it gave the system more time for differences to emerge, we have reported those results as our core findings. 
In any case, the pattern of results across the three study lengths is inconsistent -- if not at odds -- with predictions from previous work that suggest that the system would cause new users to edit more.

\section{Addressing missing literature}

R1, R2, and the AC encouraged us to do a better job addressing the literature on learning and newcomers on Wikipedia. We have added several new paragraphs, heavily edited our previous background text, and added many new citations to address these deficiencies in our previous manuscript.

\subsection{Addressing literature on risk-taking, legitimacy \& learning}

% \done {\bf Address literature on risk-taking in learning} R1: I would argue that the most important concept from the literature on learning is missing from this discussion, and that's "legitimacy" (also sometimes referred to in education and learning literature as "authenticity".) The authors explain that by doing tasks in a pretend version of Wikipedia, they make it a safe space for newcomers to practice, yet performing "canned" tasks in a pretend system is the opposite of offering a legitimate form of participation. [Note: Address by emphasizing/justifying why TWA design was "canned" (see "Why Gamify WP?" and "System Design" sections). Also add to the "Limitations of Gamification" subsection -- connect the "canned" point and authenticity literature there.]

% We now mention literature on legitimate peripheral participation in the system design section.

% We address R1's concern about designing the system to have canned tasks by defending that choice in light of Wikipedia's hostile environment to newcomers

R1 mentioned that we should address the literature on legitimacy in learning environments -- especially in light of our decision to design TWA as a simulation of the editing environment on Wikipedia. R1 was concerned that this literature suggests that users of TWA would view their edits in a simulated editor as inauthentic and that this would lead the game to be ineffective. In response to R1's concern, we have added two paragraphs discussing \citepos{lave_situated_1991} concept of legitimate peripheral participation in our ``System Design'' section that justify our choice to provide a simulated editor within TWA for new users to safely practice editing in two ways. First, we explain that the choice was necessary: in light of Wikipedia's often hostile environment for newcomers \citep{halfaker_dont_2011}, we chose to create a tutorial where new users could learn how to edit without fear of their work being reverted by veteran editors. Second, we cite previous work on authenticity in education that suggests that learners can still benefit from inauthentic tasks if they perceive an alignment between their practice tasks and the work of the community of practice they wish to join \citep{guzdial_imagineering_2006, joseph_designing_2003, shaffer_thick_1999}.

We have also added several sentences to the Discussion subsection on design-related factors that might explain the null effect, suggesting that earlier findings on sandboxes and the potential efficacy of ``inauthentic'' learning environments may be incorrect.

\subsection{Supporting the intuition for TWA's effectiveness}

R1 and R2 suggested that, given our framing, our results were unsurprising in light of previous research.
We have made several additions to the subsection in our background titled ``Why Gamify Becoming a Wikipedian?'' that point to existing work explaining why, \emph{ex ante}, we were justified in believing that a system like TWA would increase editing activity among newcomers.

First, we have added several sentences that refer to previous work that has shown that a decline in rates of editor retention is a major problem, and that this problem can be connected to changes in newcomers' initial experiences in Wikipedia over time \citep{butler_dont_2008, halfaker_dont_2011, halfaker_rise_2013}. Because these changes in new Wikipedians' initial experiences have been connected to changes in the subsequent editing rates of newcomers, we might expect a system like TWA that provides a friendlier and more fun initial experience to affect editing rates as well.

We also added a paragraph that highlights studies by \citet{dejean_big_2015} and \citet{bauer_newcomer_2007} that suggest that tutorials and increased documentation may be particularly promising ways to move the needle and increase newcomer retention in Wikipedia.

\subsection{Addressing claims that Wikipedians are ``born not made''}

% \done {\bf Address the "Wikipedians are born not made" argument} R2: If Wikipedians are born, not made, then one could also question whether these contributors are at all going to use TWA. Maybe they ignore TWA because they are already productive and do not need it? Since the paper never makes any references to these papers and discusses issues related to this (e.g. "is the Teahouse more effective since it allows them to get answers when they need help?"), this whole topic area is left hanging.

% {\bf SN: Added a reference to this in the Why Gamify Being a Wikipedian section. Argued that born not made is an important reason to socialize editors early on}

% We agree with R2 that wikipedians are born not made should be referenced in the paper. However, we see that argument as supporting our work; panciera et al's paper suggests that power editors typically are those that are engaged immediately and contribute a lot soon after joining. We hope that TWA encourages that behavior. We do not see a conflict between our approach and Panciera et al's work.

R2 asked us to address \citepos{panciera_wikipedians_2009} results suggesting that Wikipedians are ``born not made'' as one potential reason to believe that the intervention would be ineffective. We explain that although Panciera et al.~showed that the most active new users can be identified early on, they did not show that interventions will be unimportant or ineffective in changing marginal contribution rates. Simply put, the fact that Wikipedians are ``born'' does not mean that they cannot also be ``made'' in certain ways. Indeed, we fear that accepting that Wikipedians could never be ``made'' (or at least influenced) would undermine any future design research or interventions on Wikipedia or other social computing platforms.

We have cited and briefly discussed recent work by \citet{huang_how_2015} that shows how the fact that power users in social computing systems are ``born'' does not preclude important changes in patterns of editing activity over time or the ability of system designers to shift or increase contribution rates through interventions.
We also refer to a takeaway in \citet{panciera_wikipedians_2009} where the authors interpret their own results as suggesting that researchers should focus on the creation of interventions aimed at newcomers (like ours), which are likely to affect Wikipedians' earliest experiences.

\section{Highlighting research questions and contributions}

\subsection{Highlighting research questions}

% \item \done {\bf Highlight research questions} R2: At the end of the introduction we find the following two questions: "Would a gamified tutorial produce a positive, enjoyable educational experience for new Wikipedians? Would playing the tutorial impact newcomer participation patterns?" These are the paper's research questions! It would be very helpful to the reader if they were displayed more clearly, e.g. as separate items. They should not be hidden. {\bf SN: I used similar emphasis for the research questions (displaying on separate lines, italicizing) as we did for the hypotheses. Not my style, but hopefully it'll make this reviewer happy.}

The AC and R2 asked us to highlight our research questions more explicitly and clearly. We have done this by listing the research questions at the end of the introduction on separate lines and in italics.

\subsection{Contribution to gamification research}

% \item \done {\bf Highlight contributions to gamification research} R2: In "Limitations of gamification" the following sentence is found: "...our study is among the first that compares levels of participation in a task among individuals who were introduced to gamified learning first to those that were not." This is an important finding, it shouldn't be hidden back here but instead be up front in the introduction!

% R3: The other negative results argument in the paper is that these results add to growing evidence of long-term gameificiation failures. I find this argument much more compelling. In other words, by expanding on this argument, the authors may be able to address this question.

R2 and R3 suggested we do more to highlight our contribution to gamification research. Although we do not see this as a core contribution of our paper, we have addressed this by mentioning at the end of the introduction that ours is among the first studies to compare engagement in a task between individuals who were first exposed to a gamified system and those who were not.

% \item \done {\bf Clarify that TWA is Wikipedia's first gamified institutionalized socialization effort} R2: "Unlike prior systems, TWA creates a structured experience that guides newcomers through critical pieces of Wikipedia knowledge..." Do we know that there are no other prior systems that offer a similar experience? It might be that there are none within the Wikipedia domain, but what about outside it? That sentence is making a rather bold claim. {\bf SN: We only want to make this claim about Wikipedia, AFAIK - I've clarified the language to reflect that.}

In our previous manuscript, we said, ``Unlike prior systems, TWA creates a structured experience that guides newcomers through critical pieces of Wikipedia knowledge.'' R2 asked if we intended to claim that our system was the first to create structured experiences in general or simply in the context of Wikipedia. We intended to claim that TWA's approach is unique to Wikipedia and we have clarified our text to reflect this.
\subsection{Highlighting methodological contributions}

% \item \done {\bf Highlight awesomeness of using 2SLS (Aaron \& Sneha will write up; Mako will search CSCW DL)} R3: As an important side note, if we agree that this [2SLS] framework is the right way to go in the end, the authors should puff their chests more about this by claiming it as a contribution (assuming it hasn't been used at CSCW before). {\bf SN: I've brought this up in the beginning of the paper, but we'll boast about it more after Mako searches the CSCW DL}

R3 also suggested we discuss our usage of two-stage least squares (2SLS) regression in Study 2 as a methodological contribution to the social computing literature. Although it is a widely used technique in econometric studies of field experiments, we have searched the ACM Digital Library and Google Scholar and are unable to find any prior research in social computing that uses 2SLS in an invitation-based field experiment. As a result, we have listed this as a contribution of our work in our manuscript's introduction and conclusion.

\subsection{Clarifying importance of TWA's scalability}

% \item \done {\bf Clarify importance of TWA's scalability} R2: "...it does not depend on the availability, helpfulness, or intervention of existing Wikipedia editors..." The underlying argument here is that scalability is preferable to personal interaction when socializing newcomers (in peer production communities). Why is that the better solution? As discussed previously, TWA might be designed for contributors who are not going to stick around, why are those the right audience for it? Is the goal to provide everyone with a scalable impersonal introduction, or is it better to provide some (typically based on self-selection) with a personal introduction (e.g. the Teahouse)? {\bf SN: Added text to Why Gamify becoming a wikipedian that addresses this}

In our manuscript, we highlighted the scalability of TWA as one potential benefit of the system. R2 asked us to explain why we believed that scalability was an important feature, saying:

\begin{quote}
Why is that the better solution? As discussed previously, TWA might be designed for contributors who are not going to stick around, why are those the right audience for it? Is the goal to provide everyone with a scalable impersonal introduction, or is it better to provide some (typically based on self-selection) with a personal introduction (e.g. the Teahouse)?
\end{quote}

We have added text to the section ``Why Gamify Becoming a Wikipedian?'' that explains that we believe that both approaches are potentially valuable. We have highlighted the way that scalable systems like TWA can reach more users and decrease the burden on the newcomer to initiate their socialization. If effective, scalable systems of socialization could complement systems like the Teahouse among different populations of users.

% Indeed, the system we used to recruit participants is the same system used by the Teahouse. Like the Teahouse, people who actually take up the invitation to engage in TWA contribute more than average newcomers because they are more motivated and committed. Most invitees to either system are not likely to stick around and we did not know \emph{ex ante} that the system was not going to be effective.
\subsection{Reminding readers of main findings at end of introduction}

% \item \done {\bf Clarify main findings} R2: After reading the introduction, what is the reader expected to remember as the main findings in this paper? At the end of the introduction the following sentence is found: "The study underscores the importance of conducting multiple types of evaluations of social systems." Is that the main contribution? What about the implications for gamified structured introductions to peer production?

R2 asked us to remind readers of our main findings at the end of our introduction. We have added text to do so.

\section{Reframing Survey Study}

% NOTE FROM SN: So I think that the reviewers are right here in thinking that this survey doesn't tell us all that much about how \effective\ TWA is. We don't measure learning outcomes, and truthfully, our measures for things like confidence etc are also kind of shaky since they're based on two questions that are answered on a Likert scale. I actually think the real purpose of Study 1 in this paper is to pre-empt the argument that we have a null result because our system is badly designed. As such, I propose that we reframe the explicit purpose of study 1 as finding out how users react and respond to TWA, instead of calling it a qualitative evaluation of _learning outcomes_ provided by TWA

The AC, R2, and R3 raised concerns that we do not measure learning outcomes in Study 1. This is a valid critique. We've addressed this concern by reframing Study 1 to emphasize that its purpose was to gather data on participants' perceptions of their experience playing TWA. % We agree that the self-reported measures included here do not measure learning outcomes.

The reviewers' comments inspired us to reconsider whether learning outcomes really were an appropriate goal for TWA in the first place. After reviewing the system design goals and revisiting prior literature with this question in mind, we decided to situate this study around questions of perceived enjoyment and experiences related to newcomer retention (such as confidence).

Fundamentally, Study 1 is about seeing how users in our target population perceive the experience of playing TWA. We include it here to show that users who played TWA broadly liked it, and that TWA's failure to produce a positive (behavioral) effect on user contributions cannot be simply dismissed as a result of poor system design. In general, we have changed the language and variable names throughout the paper to make this point and focus Study 1 around user perceptions. We address related reviewer comments about the survey questions and design in further detail below.

\subsection{Addressing self-reporting limitations of the survey}

The AC, R2, and R3 expressed concerns related to the fact that our survey measures were all self-report data and therefore unreliable measures of learning outcomes. We agree that these measures were inappropriate for measuring learning outcomes and have addressed this concern by clarifying how and why the study sought to measure users' self-perceptions of their experience playing TWA. We have also shifted the emphasis of the background sections on gamification and gamifying Wikipedia to emphasize that the outcomes of interest in both studies were newcomer engagement and retention (rather than learning).
% The paper aims to measure "educational effectiveness". Why is a survey the appropriate way to measure that? Based on the description of the survey, it seems that it never asks specific questions to test whether TWA's users learned specific things, in other words whether the education was successful. Later when describing the results the phrase "learning to edit Wikipedia" is used, isn't that the key learning goal of TWA? Yet the survey asks Likert-scale questions. In other words, you're measuring whether TWA users are under the impression that they learned something, not whether they actually did.

% R3: Excessive import placed on trivial self-report data: It is well-known that self-report data from participants is inferior to observations of actual behavior, and that self-report data can be quite unreliable more generally. As such, in my view, it is not a contribution to show that self-report data didn't end up panning out in the behavioral results.

\subsection{Better justifying questions that were included on the survey}

R2 pointed out several important limitations of the survey design, including questions that were not posed and potential biases of question wording that could have elicited satisficing behavior among respondents. We accept and appreciate these valuable concerns -- they will influence our design of subsequent systems evaluation studies! For the sake of the present paper, we address the issues by acknowledging them as limitations. We have added material to the Discussion that explains the potential for biased responses and missing perspectives. We have also added material to the section detailing the survey design of Study 1 that, we hope, better justifies the content of the questions we did include in the survey instrument.

% R2: The survey leaves many questions unanswered, some of which the paper might want to address. Were any negative questions asked? Were there any control questions, such as a similar question worded slightly differently to allow for comparison between responses? As it is, this survey comes across as a set of positive statements about TWA that respondents agreed to. Given that respondents self-select and no attempts to contact users who didn't go through TWA appears to have been made, it is likely there is a bias in the responses, and that bias should be discussed.

% R2: From the Discussion - "The new editors in our study may have had unpleasant experiences during their initial time on Wikipedia..." It appears that the survey asked no questions about this, yet is it not a very important issue related to TWA's success?

\subsection{Addressing whether TWA boosts confidence of newcomers}

R2 also raised a key question about whether and how TWA helped raise newcomer confidence. We have addressed this by adding (1) content to the system design section detailing how the system was intended to help increase user confidence and (2) revising the description of the survey design to support the idea that several of the questions measure this important construct.

% \section{Changing language around references to Wikipedia}

\section{Specifying the language edition of Wikipedia}

The AC and R2 mentioned that our previous manuscript implicitly assumed that ``Wikipedia'' refers to the English language version and that we did not refer to other language editions. Thank you for pointing this out!

To fix this, we added a paragraph in the System Design section that explains why we launched TWA in English Wikipedia and explained that TWA could be adapted for use in other language editions and contexts.
In sections and sentences that are about specific deployments (i.e. of the system, or of study invitations), we have taken care to mention that we deployed on English Wikipedia.
% However, for concision, we do not change all references to Wikipedia in our paper to specify `English Wikipedia'.

\section{Removing/explaining Wikipedia jargon}

R2 pointed out that a general CSCW audience may not recognize terms like wikimarkup and watchlists offhand. We have carefully reread our paper and attempted to remove or explain any Wikipedia-specific jargon throughout the paper.

% \begin{itemize}
% \item \done {\bf Mention why we chose to study English Wikipedia} R2: ``This paper doesn't discuss any other language editions of Wikipedia besides the English one, and makes the assumption that "Wikipedia" equals the English edition. Adding a mention that Wikipedia exists in multiple languages and explaining why English was chosen as the language where TWA was launched would be very helpful.'' {\bf SN: I added a paragraph in the system design section to address this}

% \item \done {\bf Remove/explain WP jargon} R2: Your readers might not know what watchlists are; take care to write for a general audience, not everyone knows a lot about how Wikipedia works behind the scenes. {\bf SN: Clarified the purpose of the watchlist mission, added a couple of footnotes to give one sentence explanations of things like wikimarkup and Featured Articles}
% \end{itemize}

% \section{Study 2 clarifications}
% R2 and R3 requested a number of clarifications on our field experiment study design and analysis.

\section{Account selection criteria in Study 2}

R2 suggested that we clarify the account selection criteria used in Study 2. Specifically, they note:

\begin{quote}
...in the second paragraph of "Methods" it describes the selection criteria, that at least one contribution would have to be made after getting invited. This would perhaps be much less confusing if the criteria were first explained, particularly how the experiment and control groups were set up, and then how many accounts were identified.
\end{quote}

We have reorganized the Study 2 Methods section in exactly this manner and agree it is clearer. Thank you for this suggestion.

\section{Defining ``date of inclusion'' in Study 2}

R2 also requested clarification on what we meant by the ``date of inclusion'' in the study. The date of inclusion for a user in the treatment group was the date that the invitation to play TWA was sent. For users in our control group, the date of inclusion was the date the invitation \emph{would have been sent} had the user been in treatment instead. We have added detail to the Study 2 Methods and Measures subsections to make this clear.

\section{Difference in size of study population between studies}

R2 asked us to explain the difference in size between the number of invitees in Study 1 and 2. We have explained that Study 2 had one inclusion criterion in addition to the criteria used for Study 1: editors needed to make at least one edit to Wikipedia \emph{after} their date of inclusion in the study. We added this criterion to ensure that users had a chance to see their invitation to TWA.
% This led to a smaller number of participants in Study 2 than in Study 1.
We have clarified this in the Methods section of Study 2.

\section{Interpretation of 2SLS results}

R3 asked us to clarify whether we had used the correct analytic framework.
In particular, they said:

\begin{quote}
(1) The goal here is to introduce a treatment that ultimately will produce strong new members of the Wikipedia community at a higher rate than the control.

(2) Let's say the game produces 3 such members out of 100 new editors and the control produces 1, which looks like it might be the case. Let's also say that this pattern additionally persists over a large n.

(3) If this is true, why do we care about the potentially moderating effect of the invitations?
\end{quote}

The issue here is that the ``treatment'' in our study is not the tutorial; it is the \emph{invitation} to play the tutorial. In our field experiment, we randomly sorted qualifying users into treatment and control groups, and sent invitations to play TWA to those in treatment, but not to those in control.

Although a subset of the treatment group self-selected into playing TWA, these users cannot be directly compared to the whole control group because the control group likely contains users who would have ignored the invitation even if they had been sent one. Since randomization happened at the invitation stage, we can only examine the effect of the game \emph{conditional on being invited}. This is why we use two-stage least squares regression. This is a well-known threat, and ours is a common approach in social scientific field experiments.

We recognize that this explanation needed to be clearer in our paper -- especially since we are using a method that, although common in economics and political science, is not yet commonly used by social computing researchers. To this end, we have significantly rewritten the Analytic Approach section to provide a clearer and more detailed explanation of our experiment design and methods. We have also cited additional econometrics and field experiment design textbooks that explain both the threat and the appropriateness of our analytic design.

\section{Issues raised about content persistence}

R2 raised a number of questions about our content persistence metric. We have added several citations to the work on content persistence in Wikipedia, including \citet{priedhorsky_creating_2007}, \citet{ekstrand_rv_2009}, \citet{adler_content-driven_2007}, and \citet{adler_assigning_2008}, to help clarify our methods and put them in the context of previous work developing and using these measures. Although it was not an issue raised by any reviewers, we have also clarified that we apply the content persistence measure only to edits to pages in the article namespace.

\subsection{Choice of edit radius size}

% \item \done {\bf Justify six-edit radius for edit persistence metric (Mako and Aaron)} R2: Why is a six-edit radius chosen when measuring word persistence? Halfaker et al. make no claim about what the radius should be in the referenced work, and Ekstrand et al suggest a 15 edit radius in a related paper (Ekstrand and Riedl "rv you're dumb: identifying discarded work in Wiki article history." WikiSym 2009) The six-edit radius also comes with an issue that is unadressed: how long does it take for an edit made by a contributor in the study to reach that six-edit radius? If it hasn't been reached at the end of the study period, that edit has to be discarded as its quality is unknown. In a related paper, Farzan and Kraut instead chose to use percentage of words that survived as a measure of quality (Farzan and Kraut "Wikipedia classroom experiment: bidirectional benefits of students' engagement in online production communities" CHI 2013)

R2 raised questions about our choice of edit radius for calculating content persistence. We have explained that we chose a radius of 6 because this is what was used by Adler et al.~in \emph{WikiTrust} -- the most widely used and systematically validated implementation of content persistence for Wikipedia contributions. We have clarified that we followed the WikiTrust model in one other way as well, by ``collapsing'' subsequent edits by the same user into a single edit session. We have also added a justification of our selection of edit radius and explained that different researchers have used different radii.
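Schematically, the measure credits the words added by an edit once for each of the next $k$ edit sessions that retain them; the toy sketch below illustrates the idea (illustrative names only, not the WikiTrust implementation itself):

\begin{verbatim}
def persistence_score(added_words, later_sessions, k=6):
    # later_sessions: ordered list of sets of words present in the
    # article after each subsequent edit session; k=6 follows WikiTrust
    return sum(sum(1 for w in added_words if w in session)
               for session in later_sessions[:k])
\end{verbatim}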
\section{Issues raised about content persistence}

R2 raised a number of questions about our content persistence metric. We have added several citations to the work on content persistence in Wikipedia, including \citet{priedhorsky_creating_2007}, \citet{ekstrand_rv_2009}, \citet{adler_content-driven_2007} and \citet{adler_assigning_2008}, to help clarify our methods and put them in the context of previous work developing and using these measures. Although it was not an issue raised by any reviewers, we have also clarified that we apply the content persistence measure only to edits to pages in the article namespace.

\subsection{Choice of edit radius size}

% \item \done {\bf Justify six-edit radius for edit persistence metric (Mako and Aaron)} R2: Why is a six-edit radius chosen when measuring word persistence? Halfaker et al. make no claim about what the radius should be in the referenced work, and Ekstrand et al suggest a 15 edit radius in a related paper (Ekstrand and Riedl "rv you're dumb: identifying discarded work in Wiki article history." WikiSym 2009) The six-edit radius also comes with an issue that is unadressed: how long does it take for an edit made by a contributor in the study to reach that six-edit radius? If it hasn't been reached at the end of the study period, that edit has to be discarded as its quality is unknown. In a related paper, Farzan and Kraut instead chose to use percentage of words that survived as a measure of quality (Farzan and Kraut "Wikipedia classroom experiment: bidirectional benefits of students' engagement in online production communities" CHI 2013)

R2 raised questions about our choice of edit radius for calculating content persistence. We have explained that we chose a radius of six because this is what was used by Adler et al.~in \emph{WikiTrust} -- the most widely used and systematically validated implementation of content persistence for Wikipedia contributions. We have also clarified that we follow the WikiTrust model in one other way: we ``collapse'' consecutive edits by the same user into a single edit session. Finally, we have added a justification of our selection of edit radius and noted that different researchers have used different radii.
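As a rough illustration of the measure itself, the sketch below computes token persistence over a six-edit radius for revisions reduced to lists of word tokens. It is a simplification of the WikiTrust approach, not our actual implementation.

\begin{verbatim}
from collections import Counter

def persistence(revisions, i, radius=6):
    """Mean share of tokens added in revision i that survive in each of
    the next `radius` revisions; None when the window is incomplete."""
    window = revisions[i + 1 : i + 1 + radius]
    if len(window) < radius:
        return None  # too few subsequent edits: persistence still unknown
    prev = revisions[i - 1] if i > 0 else []
    added = Counter(revisions[i]) - Counter(prev)  # tokens edit i introduced
    total = sum(added.values())
    if total == 0:
        return None
    survived = sum(min(count, Counter(rev)[tok])
                   for rev in window for tok, count in added.items())
    return survived / (radius * total)

revs = [["a"], ["a", "b"], ["a", "b"], ["a"], ["a", "b"], ["a"], ["a"], ["a"]]
print(persistence(revs, 1))  # "b" survives in 2 of 6 later revisions: 0.33
\end{verbatim}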
\subsection{Threats to validity from short edit radii}

We have also explained that, as R2 suggests, our measure will underestimate the productivity of users who edit very infrequently edited articles. Many of the articles edited by users in our study were not the subject of six subsequent sessions and, as a result, have lower quality scores at the point that we collected these data. Indeed, in the dataset we collected, the mean number of subsequent edit sessions is only 2.6.

Of course, this is a threat to our results only if users in the treatment group are systematically more likely to improve Wikipedia than the control group but concentrate their relatively high levels of activity in the small number of articles with few subsequent edits. We compared the mean radius length for edits in our treatment and control groups and found no statistically significant difference. We have mentioned this fact in our Methods section.

\subsection{Alternative measures of text persistence}

R2 suggested that we might adopt the approach used by \citet{farzan_wikipedia_2013}, which uses the percentage of words maintained over shorter radii. Although this approach has advantages, it introduces new trade-offs. For example, it can treat low-quality edits as high quality when they are made to poorly maintained articles. Because we collected subsequent edit data over a relatively long period of time, we opted for the more widely used and validated metric implemented in WikiTrust and advocated by Halfaker.


% \begin{itemize}

% \item \done {\bf Clarify account selection criteria} R2: The description of how accounts were selected to be included is rather confusing. First it describes 1,967 accounts that met the same criteria as for the user survey, however 10,000 individuals ("accounts"?) were invited to the beta. Why is one an order of magnitude larger than the other? Then in the second paragraph of "Methods" it describes the selection criteria, that at least one contribution would have to be made after getting invited. This would perhaps be much less confusing if the criteria were first explained, particularly how the experiment and control groups were set up, and then how many accounts were identified. {\bf SN: Followed this suggestion, and switched the order}

% \item \done {\bf Clarify criteria for measuring edits before and after inclusion in the study} R2: "we measure the overall contributions as the total number of edits made by each account from the time of inclusion in the study until May 31, 2014." When exactly is "time of inclusion", is that when they got the invite? What about when they completed one (or all) TWA mission(s)? The concern here is that all contributions are measured, whereas the experiment sets up a pre/post-scenario. Later on the paper refers to "subsequent contributions", indicating that contributions after a certain point in time was measured. This quickly becomes rather confusing, spelling out clearly what points in a user's account history is used (e.g. "we measure contributions at four points in time: when the user registered their account, the time of invitation, when they first started using TWA, and the end of the experiment") would be very helpful. {\bf SN: Added clarifying language in the Study 2 Methods and Measures sections}

% \item \done {\bf Clarify interpretation of 2SLS results (Aaron \& Sneha)} R3: The authors argue that new editors that responded to the invitation to play the game might just be new editors who are engaged and, critically, would have been power editors whether or not the game existed. However, barring a random fluke, shouldn’t these future power editors also have been in the control group? If I’m right here, I’m thinking the invitation doesn’t matter and a more traditional statistical analysis (or at least one targeted at identifying rare events) is appropriate.
% I could be wrong, but I want the authors to respond to this question, both through feedback to reviewers and clarifications in the paper. {\bf [Note: The paper isn't clear that the randomization happens at the invitation stage and that's why ITT is appropriate. Change this!]}
% \end{itemize}

\section{Expanding Discussion}

% \done {\bf Address the fact that we aren't forcing people to play the game before editing} R3: Does the game work if folks are forced to play it prior to editing Wikipedia, as would be the case in most other institutionalized socialization contexts? This is not just a hypothetical: this game could be used in all Wikipedia Education Project classes and related endeavors. {\bf SN: added a paragraph in the self-selection section of the discussion that suggests this as future work}

\subsection{Addressing paths to positive results}

R3 asks us to address whether we might see different results if users were required to play TWA before editing Wikipedia. Specifically, they ask:

\begin{quote}
Does the game work if folks are forced to play it prior to editing Wikipedia, as would be the case in most other institutionalized socialization contexts? This is not just a hypothetical: this game could be used in all Wikipedia Education Project classes and related endeavors.
\end{quote}

We constructed our field experiment to mimic how TWA would likely be deployed to newcomers on English Wikipedia. Since Wikipedia has historically sought to accommodate users who wish to contribute to the encyclopedia quickly and easily, forcing new users to play the game before being able to edit would go against core Wikipedia principles. Self-selection into the tutorial was always part of its design, and we evaluated its effect accordingly.

However, we acknowledge in the discussion section that the dynamics of self-selection into the tutorial might be one factor contributing to the null effect. It is important to note that when we say the system had no aggregate effect on newcomer contribution in our field experiment, we do not mean that the system is without value.

Indeed, we are proud of many aspects of the system design, and the positive feedback we have received (and continue to receive) from those who have played TWA suggests that there are use cases worth exploring beyond invitation-based deployments to new users on Wikipedia. Testing whether the tutorial has an effect in settings where participants can be required to play the game (such as classrooms or lab studies) would be valuable future work in this space. We have added text to the Discussion section accordingly.

\subsection{Argument in favor of negative results}

% \item \done {\bf Stronger defense of why negative results are important} R3: As noted above, this paper is a negative results paper at its core, and we’ll have to have a broad discussion about this at the PC meeting, assuming the paper makes it this far. In the event that this occurs, can the authors provide a more robust argument as to why these negative results are important for other researchers and practitioners?

The AC and R3 encouraged us to provide ``a more robust argument as to why [our] negative results are important for other researchers and practitioners.'' We have added a paragraph to our conclusion to address this.

We argue that although null results can be difficult to convincingly establish and interpret, they play an important role in validating our knowledge of social computing theory and systems. Null results can often be attributed to small sample sizes, but our sample is relatively large, suggesting that the effect is well estimated. Although null results are often difficult to explain, they are important to document. It would be dangerous for social computing researchers to highlight only positive results as evidence that theories hold while dismissing every negative result as the product of a botched experiment or poor design.

Our null result is meaningful in that the design of our system was informed by previous empirical, theoretical, and design work, and the system performed well on the survey-style metrics used to evaluate most similar social computing systems. Like any study, ours does not provide the final word on institutionalized socialization in peer production, but we believe it contributes to our understanding of the topic.

In terms of R3's general question about what makes a strong negative result, the best answer is put forward in the second-to-last paragraph of our Study 2 Results section in the modified manuscript. Our parameter estimates represent well-estimated zeroes that are unlikely to be artifacts of a small sample. A post-hoc power analysis shows that if a dataset of this size displayed even a small effect (0.2 standard deviations), we would have had a 99\% chance of detecting it at the 0.05 significance level.
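For reference, the power calculation can be reproduced along the following lines; the per-group n shown is illustrative rather than our exact sample size.

\begin{verbatim}
from statsmodels.stats.power import TTestIndPower

# Probability of detecting a standardized effect of d = 0.2 at alpha = 0.05
# in a two-group comparison with (hypothetically) 900 users per group.
power = TTestIndPower().power(effect_size=0.2, nobs1=900,
                              alpha=0.05, ratio=1.0)
print(round(power, 2))  # ~0.99
\end{verbatim}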
\section{Improving clarity of Study 1 bar charts}

The AC and R2 suggested that we regenerate the Study 1 bar charts to display the results as percentages, rather than counts, for ease of comparison. We have done so and updated Figures 4 and 5 accordingly.
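The change is mechanical; in outline, with made-up response counts for a single survey question rather than our data:

\begin{verbatim}
import matplotlib.pyplot as plt

labels = ["Str. disagree", "Disagree", "Neutral", "Agree", "Str. agree"]
counts = [3, 5, 10, 40, 22]                     # hypothetical Likert counts
pcts = [100 * c / sum(counts) for c in counts]  # normalize by respondents

plt.bar(labels, pcts)
plt.ylabel("Share of respondents (%)")  # now comparable across questions
plt.show()
\end{verbatim}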
\section{Clarifying and defining terms used in TWA}

\subsection{Distinguishing between `basic' and `advanced' editing}
R2 took issue with a distinction that TWA makes between basic and advanced editing on Wikipedia. Specifically, they noted:

\begin{quote}
``In "Missions" a distinction is made between "basic" and "advanced" editing techniques. It appears to be somewhat arbitrary, why is adding sources advanced editing, but watchlists are not?"
\end{quote}

We agreed that this was an arbitrary and unimportant distinction, and we have changed the text in the Missions subsection to reflect this. Adding sources and formatting sections are now described as `additional' techniques rather than advanced ones.

\subsection{Clarifying `reaching' a mission}
R2 expressed confusion with our description of attrition in Table 3 and thought it conflicted with the text in the Study 2 Results subsection.

\begin{quote}
``It is confusing how 181 users reached the final mission but did not complete it, yet in the text it seems these 181 users actually did.''
\end{quote}

We have updated the text in the Study 2 Results subsection, as well as the caption of Table 3, to describe our attrition metric more carefully. We also now refrain from saying that 181 users `completed' the game, because our metric aggregates all users who played some part of the highest level they reached. It is possible that many or all of the 181 users who reached mission 7 did complete it, but since we do not distinguish between users who played some or all of mission 7, we changed the language in the text to be more precise.

\subsection{Distinguishing between beta and non-beta versions of TWA}

% \item \done {\bf Distinguish more clearly between beta and non-beta versions of TWA} R2: "This is a larger proportion of users than took up the invitation in Study 1, which may be due to changes in the invitation text." Earlier in the paper study 1 refers to a "beta", whereas this appears to be not. If this is the case, this is an important difference between the two that should be made clear to the reader. {\bf SN: Noted in the Methods section of Study 2 that the system was no longer in beta.}

R2 noted that the version of TWA evaluated in Study 1 was in beta, while the version evaluated in Study 2 was not, and asked us to make the distinction between these two versions clearer. To address this, we are now explicit in the introduction to Study 2 that TWA was no longer in beta at that time. No major changes were made to the system after its beta release, so the fact that the system was no longer in beta during Study 2 is unlikely to affect any results.
% TODO: List any changes between beta and non-beta versions


\section{Stylistic changes}

The AC and R2 asked us for a number of minor stylistic changes and clarifications in the text.

\subsection{Cleaning up citation orders}

% \item \done {\bf Clean up citation orders} R2: When citing multiple papers it is preferable that they are in order, e.g "[14, 23, 17]" should be "[14, 17, 23]" (page 1). {\bf SN: We should check for this again after finishing all edits}

R2 requested that citations to multiple papers be placed in the order in which they appear in our references section (e.g., ``[14, 23, 17]'' should be ``[14, 17, 23]''). We have taken care to arrange our citations in this way.

\subsection{Moving tables for clarity}

% \todo {\bf Move tables for clarity} R2: * Tables 1, 2, 3, and 4, as well as figure 6 should be brought closer together so it's easier to follow along. Table 1 occurs before the text that refers to it, and table 4 is two pages further along. Putting all tables and figure 6 on the same page might be a good solution.

As requested by R2, we have adjusted table placement in our LaTeX source so that the tables appear closer together and closer to the text that references them.

\section{Add citation for newcomers and vandalism}

%\item \done {\bf Cite more thoroughly} R2: An overall issue here is that there are few citations to sources. For instance a claim is made that "newly created accounts are the primary source of spam and vandalism on Wikipedia". Consider a "[citation needed]" added after that.

R2 requested that we add a citation for our claim that newly created accounts are a primary source of vandalism. We have added a citation to \citet{potthast_automatic_2008}, which suggests that the number of edits per user is an excellent predictor of vandalism in Wikipedia.

\section{Improved example of institutionalized welcoming}

Although it was not requested by any reviewer, we replaced one example of institutionalized elements that existed in Wikipedia prior to our intervention (WelcomerBot) with another (the Welcoming Committee) in the section ``Why gamify becoming a Wikipedian?''

WelcomerBot was active for a period of only four years and was a fairly small-scale intervention. The Welcoming Committee has been continuously active for much of Wikipedia's history and has facilitated the delivery of welcome messages to many more new editors.


\section{Additional analyses}
% R2: It might for instance be that TWA instead has an effect on how long it takes before a user drops out of the system.

% R3: Another related outcome variable that might be useful to analyze is how
% long the new editors in each group remained active editors in the
% community (i.e. survival analysis). Because the data is quite old, this
% should be an easy new analysis to run, and longevity has been a variable
% interest in a number of peer production studies.

% In their second draft and the feedback to reviewers, I would like to see
% the authors discuss either new analyses related to power users or why thy
% did not consider this outcome variable. I would also like to see the same
% for survival analysis.

The AC, R2, and R3 raised the possibility of examining other outcome variables, such as longevity of activity (i.e., survival analysis) and the chance of producing a power editor. Although we explored both possibilities at some length, we did not add either analysis.

\subsection{Modeling user longevity}

Both R2 and R3 raised questions about adding an analysis of user longevity, asking whether users, although not contributing more within their first six months, might contribute over a longer period of time. The reviewers suggested that survival models (e.g., Cox proportional hazards models) might be an appropriate analytic approach. Although we agree that this is a compelling measure of newcomer retention, we opted not to include this additional analysis for three reasons.

First, there are important concerns with survival models in social computing systems because editors can never truly be declared gone: they can return at any point. Second, although such models are relatively rare in Wikipedia research, the survival models of Wikipedia activity by \citet{zhang_how_2012} suggest that the large majority of user activity happens within our 180-day window. Finally, and most critically, explaining the additional methodology would have considerably lengthened an already long paper.

As a result, we believe that this question would be better addressed in future work, and we have included some text in the discussion section suggesting it as such.
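To give a concrete sense of what the reviewers proposed, a minimal sketch of such a survival analysis follows, using the lifelines package and fabricated data with hypothetical column names; this is not an analysis we ran.

\begin{verbatim}
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({
    "treated": rng.integers(0, 2, n),                    # invited vs. control
    "days_active": rng.exponential(60, n).clip(1, 180),  # days to last edit
})
# Users still editing at day 180 are right-censored, not "gone" -- which is
# exactly the difficulty with declaring editors departed noted above.
df["observed"] = (df["days_active"] < 180).astype(int)

cph = CoxPHFitter()
cph.fit(df, duration_col="days_active", event_col="observed")
cph.print_summary()  # hazard ratio on `treated` estimates retention effects
\end{verbatim}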
\subsection{Modeling highly-skewed count outcomes}

% Aaron working on this now.
% mention rosenbaum citation
% mention justification for negbin models and mann-whitney
% mention that we added longer observations

R3 raised concerns about whether our analysis can identify and precisely estimate rare effects:

\begin{quote}
The authors seem focused on the average effects across the entire control and treatment groups (the two treatment groups, to be specific). However, would it not also be reasonable to consider the metric I describe above: the \% of new editors that go on to be power editors? Since power editors end up contributing most of the edits anyway *over the long term*, to me this seems like the way to go (i.e. if this group of editors were followed for years, statistically significant differences would begin to emerge). If the authors agree, the authors need to reanalyze their data with this metric in mind.
\end{quote}

We have already addressed the concerns about extending the temporal coverage of the analysis. On the question of identifying rare effects, we agree with R3 that this is an important consideration and address it in several ways. First, we have clarified our justification of the analytic techniques we use to underscore that the approaches we employ (negative binomial regression and Mann-Whitney tests) are the most appropriate, proven methods for identifying precisely the sort of effects that R3 suggests may have occurred. In essence, the issue here relates to the type of outcome variable we are modeling --- a highly-skewed count. Our methods of analysis are the best conventional parametric and non-parametric techniques for analyzing these kinds of variables, and they have been used in prior field experiments conducted on Wikipedia editors, such as \citet{restivo_no_2014}. We have added citations and text to support this in the ``Analytic Approach'' subsection of Study 2. We have also incorporated a note in the Discussion section of Study 2 about extending this study with the more novel, relatively unknown methods for detecting attributable effects described in \citet{rosenbaum_design_2010}. Doing so, however, lies beyond the scope of this paper.
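In outline, the two techniques look as follows on simulated overdispersed counts; this is illustrative only, and the exact specifications of our models are given in the paper.

\begin{verbatim}
import numpy as np
import statsmodels.api as sm
from scipy.stats import mannwhitneyu

rng = np.random.default_rng(0)
n = 1000
treated = rng.integers(0, 2, n)
# Overdispersed edit counts via a gamma-Poisson (negative binomial) mixture.
edits = rng.poisson(rng.gamma(shape=0.5, scale=6.0 * (1 + treated)))

# Parametric: negative binomial regression of edit counts on treatment.
nb = sm.NegativeBinomial(edits, sm.add_constant(treated)).fit(disp=0)
print(nb.params)

# Non-parametric: Mann-Whitney U test comparing the two distributions.
print(mannwhitneyu(edits[treated == 1], edits[treated == 0]))
\end{verbatim}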
\bibliographystyle{chicago}
\bibliography{refs-processed}

%The paper presents the design and evaluation of a gamified tool for socializing and retaining new Wikipedia editors. The study found that users liked—but did not learn from—the system.

%The focus on improving the experience of newcomers in Wikipedia is relevant and important. Reviewers describe the study as well motivated and exceptionally well-written. Read R3’s comments on the writing quality and congratulate yourself!

% The reviewers, however, have many concerns about the paper—each
% focusing on a different aspect of the work. The concerns the reviewers
% note /may/ be addressable during the revise and resubmit period, but it
% will be an exceptionally herculean effort. Also, please keep in mind that
% there is no guarantee of acceptance even after making changes. So, it is
% at the authors’ discretion about whether or not to proceed with
% revisions or withdraw the paper.

% There is split amongst the reviewers as to whether the failure of the
% tool is interesting or not. R1 raises concerns that the failure of the
% tool could be predicted from existing literature, suggesting little
% rationale for doing the work in the first place. R2 asks whether there
% is something fundamentally different about people who continue to
% contribute to Wikipedia, and as such whether the system holds value in
% practice.
R3, on the other hand, sees much value in the systems +% contribution of the work as well as the real-world evaluation. R3's +% review has some suggestions of alternative framings that may make the +% contribution more valuable. + +% In treatment of related work, many improvements are needed. R1 notes that +% the discussion of the well-known concept of legitimacy/authenticity in +% learning environments is missing. R2 also points to missing literature +% about Wikipedian experience. + +% R2 and R3 raise a number of methodological questions about the paper. R2 +% suggests the distribution of participants across the timeline may bias +% the results. R3, on the other hand, sees opportunity here, suggesting +% additional statistical analysis related to longevity and power users. +% Both R2 and R3 question the methodological choice and contribution of +% measuring perceptions of learning rather than actual learning. Overall, +% this points to a need for at the very least justifying the methodological +% choices and at the most carrying out additional statistical analyses. + +% In summary, there is quite a bit of work to be done. I wish the authors +% the best of luck, should they choose to continue in the review process. + + +% Requested Revisions + +% REQUIRED: +% - Provide justification for why the study was worth carrying out, in +% response to R1 and R2’s concerns. R3’s review may have some insight +% into alternative framings. +% - State the research questions more explicitly, as per R2’s +% recommendation +% - Address R1 and R2’s concerns about missing literature +% - Ensure that the narrative around Wikipedia is clear to readers who do +% not have an in-depth background in production/editing details. +% - Improve the clarity of the results by using percentages or another +% baseline that allows comparison between numbers, as per R2’s review. +% - Provide justification for measuring perceptions of learning versus +% actual learning. +% - Provide a robust discussion of why the results are meaningful for +% researchers and/or practitioners. + +% OPTIONAL, RECOMMENDED +% - Consider carrying out additional statistical analyses as recommended by +% R3. +% - Provide a short justification for use of English language Wikipedia, as +% per R2’s review. + +% Formatting and Reference Issues + + + +% ------------------------ Submission 516, Review 1 ------------------------ + +% Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial for New Users + + +% Expertise + +% 4 (Expert) + +% First Round Overall Recommendation + +% 3 (Maybe acceptable (with significant modifications)) + +% Contribution and Criteria for Evaluation + +% In this paper, the authors present the design and two-pronged evaluation +% of a tutorial for new wikipedia editors that uses elements of +% gamification like missions and badges to help coach new editors and help +% them learn best practices and social norms of Wikipedia. The outcome is +% that users like the system but, based on behavioral measures, they don't +% actually learn from it. Learning interventions are a classic kind of +% research problem, and the paper should include robust measures of +% learning, as well as a good description of the designed intervention +% itself, why the design is expected to lead to learning, and a clear +% description of the study. + +% Assessment of the Paper + +% This is a reasonably well motivated study with connections to appropriate +% literature and the writing is engaging and understandable. 
The problem of +% enculturating newcomers into projects like Wikipedia is well documented +% and this paper investigates a potential intervention with an admirably +% well-planned study. Designing learning interventions is really difficult +% and I commend the authors on a well-executed effort. + +% Still, I am ambivalent about the paper because I would have predicted +% these outcomes based on the literature alone. In the discussion, the +% authors note that one mismatch between Wikipedia and the tutorial as +% designed involve the “gradual peripheral participation” of newcomers +% as they take on the identity of “Wikipedian.” They suggest that maybe +% speeding up this process is unnatural. I would argue that the most +% important concept from the literature on learning is missing from this +% discussion, and that’s “legitimacy” (also sometimes referred to in +% education and learning literature as “authenticity”.) The authors +% explain that by doing tasks in a pretend version of Wikipedia, they make +% it a safe space for newcomers to practice, yet performing “canned” +% tasks in a pretend system is the opposite of offering a legitimate form +% of participation. I immediately wonder, why not use what we know from the +% literature to create low-risk missions that newcomers can complete while +% legitimately contributing to the encyclopedia? Risk taking is a +% fundamental characteristic of games that makes them engaging; it +% certainly seems like it would play a role in people’s motivation in a +% scenario like this. Rather than eliminating risk, the literature on +% legitimate peripheral participation would suggest that finding the right +% degree of risk is required to facilitate progressive entree into a set of +% shared practices. + +% I am disappointed by the missed opportunity here, the outcome mainly +% seems to verify that what we know shouldn’t work based on the +% literature in fact doesn’t work. Yet still the paper isn’t bad and +% the study is carefully crafted and reported. + +% With some extension and reflection, I think the discussion could help +% point future research in a more fruitful direction. There are millions of +% pages written on the challenges of designing learning interventions that +% change people’s behavior, this paper ends on a painfully obvious note. +% It’s true that usability isn’t all it takes, but what can we learn +% from TWA adventure about the design of systems to facilitate +% enculturation into a community of practice? What can we take away from +% this that might inform more successful tutorial systems in the future? + +% Formatting and Reference Issues + + + +% ------------------------ Submission 516, Review 2 ------------------------ + +% Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial for New Users + + +% Expertise + +% 4 (Expert) + +% First Round Overall Recommendation + +% 2 (Probably NOT acceptable) + +% Contribution and Criteria for Evaluation + +% This paper's contribution is the design and evaluation of a structured +% introduction to a peer production community (English Wikipedia) called +% "The Wikipedia Adventure". TWA's design is rooted in theories of +% gamification, and its utility is evaluated through a user survey and an +% invitation-based field experiment. The paper reports on the survey +% respondents' satisfaction with TWA, and how their experiment results +% reveal some of the challenges of affecting lasting changes to contributor +% patterns in peer production communities. 
These findings are then +% discussed in relation to cultural factors in Wikipedia, issuses of +% self-selection and voluntary participation, and the limitations of +% gamification. + +% When evaluating a paper that describes the design of a system, the two +% main criteria are that the system and/or its development setting is/are +% novel, and that the way the system is evaluated is methodologically +% sound. + +% Assessment of the Paper + +% As mentioned in the contribution section, this paper's contribution is +% the design and evaluation of a structured introduction to a peer +% production community based on gamification, called "The Wikipedia +% Adventure". This is a great idea and sounds like a useful addition to +% Wikipedia. The paper is written in a way that makes it easy to read, and +% provides the reader with a good introduction to how TWA's design is +% rooted in theories of gamification, thus applying these principles in +% what appears to be a novel setting. The paper also does a good job of +% discussing the findings, organizing them in a way that is easy to follow +% and touching on important points (e.g. cultural factors, and the +% limitations of self-selection and gamification). + +% The overall ideas and approach taken in this paper are sound, they are in +% line with the criteria described previously. Unfortunately, there are +% two major issues and several minor ones that need to be resolved before +% this paper is ready for publication. The first major issue is that the +% methodology used to evaluate performance in the invitation-based +% experiment measures contribution in a skewed manner and does not +% establish why that is appropriate. Secondly, the paper fails to consider +% arguments put forth by Panciera et al's "Wikipedians are Born, Not Made" +% paper. This review will expand on both of these major issues below. +% Further below will be notes and comments with suggestions for improvement +% for specific sections of the paper, some of which are rather substantial +% as well. + +% 1: Evaluating TWA effectiveness by number of contributions +% ---------------------------------------------------------- + +% A major part of the paper is the evaluation of TWA's effect on subsequent +% contributions. To evaluate this an invitation-based field experiment is +% used, and the paper does a great job of justifying why that is +% appropriate in this setting. The experiment runs from February 2014 and +% three months forward. Exact dates are not given, so let us assume that +% it ran until the end of April 2014. User contributions are then measured +% until the end of May 2014. + +% There are two problems with this approach that the paper fails to address +% properly. One is the issue of right-truncation found in the data. +% Contributors who joined in early February 2014 would have about four +% months to make edits, whereas those who joined in late April would only +% have about a month. The model does contain a control variable for number +% of days in the experiment, but why is that appropriate in this context? +% If we examine other work in the same domain, they tend to either use a +% much longer time period (e.g. the Teahouse paper, citation 23, which uses +% 6-9 months) or ensure that the time period is fixed (e.g. Kittur et al. +% "Herding the Cats: The Influence of Groups in Coordinating Peer +% Production", WikiSym 2009; or Zhu et al. "Effectiveness of Shared +% Leadership in Online Communities", CSCW 2012). 
+ +% Related to the right-truncation problem is the fact that the paper also +% fails to discuss and justify what a reasonable timespan for measuring the +% effect of TWA is, and that it will have an effect on the number of +% contributions made. It might for instance be that TWA instead has an +% effect on how long it takes before a user drops out of the system. If we +% assume that TWA has an effect on contributions, what timespan is needed +% to measure that effect? The paper assumes that a month is adequate to +% discover it, whereas one might suspect that it is only measurable over a +% longer period of time. If it is the case that a short period of time is +% appropriate (for instance because these users are likely to drop out +% after a certain amount of time) the paper needs to properly establish +% that, either by measuring it or referring to previous work. + +% 2: Wikipedians Are Born, Not Made +% --------------------------------- + +% In their GROUP 2009 paper "Wikipedians Are Born, Not Made: A Study of +% Power Editors on Wikipedia", Panciera et al. show data that argues that +% those contributors who are going to stick around behave in a way that is +% different from the very beginning. In followup work published in 2010 +% they find similar differences in another peer production community. +% (Panciera et al. "Lurking? cyclopaths?: a quantitative lifecycle analysis +% of user behavior in a geowiki." CHI 2010) + +% These two papers and the argument they put forth are relevant because +% they question who TWA is designed for. In the related work a reference to +% Bryant et al's "Becoming Wikipedian" is made, thereby suggesting that TWA +% is designed to teach someone how to be a Wikipedian. As Panciera et al's +% paper argues along the lines of these contributors already being +% Wikipedians, should TWA be designed to instead help these contributors +% stay productive? + +% If Wikipedians are born, not made, then one could also question whether +% these contributors are at all going to use TWA. Maybe they ignore TWA +% because they are already productive and do not need it? Since the paper +% never makes any references to these papers and discusses issues related +% to this (e.g. "is the Teahouse more effective since it allows them to get +% answers when they need help?"), this whole topic area is left hanging. + +% --- +% Below follows comments/notes for each section of the paper. + +% Introduction: +% * An overall issue here is that there are few citations to sources. For +% instance a claim is made that "newly created accounts are the primary +% source of spam and vandalism on Wikipedia". Consider a "[citation +% needed]" added after that. +% * When citing multiple papers it is preferable that they are in order, +% e.g "[14, 23, 17]" should be "[14, 17, 23]" (page 1). This minor issue +% also occurs elsewhere in the paper. +% * "Unlike prior systems, TWA creates a structured experience that guides +% newcomers through critical pieces of Wikipedia knowledge..." Do we know +% that there are no other prior systems that offer a similar experience? It +% might be that there are none within the Wikipedia domain, but what about +% outside it? That sentence is making a rather bold claim. +% * After reading the introduction, what is the reader expected to remember +% as the main findings in this paper? At the end of the introduction the +% following sentence is found: "The study underscores the importance of +% conducting multiple types of evaluations of social systems." 
Is that the +% main contribution? What about the implications for gamified structured +% introductions to peer production? + +% Background: +% * "...women reported that they found that contributing to Wikipedia +% involved a high level of conflict and that they lacked confidence in +% their expertise [8]. This suggests that more effective onboarding tools +% could help incorporate newcomers." This is an important side of +% Wikipedia, but how does TWA's design help mitigate this issue? Are there +% design elements in TWA that aims to boost confidence in one's expertise? +% * At the end of the introduction we find the following two questions: +% "Would a gamified tutorial produce a positive, enjoyable educational +% experience for new Wikipedians? Would playing the tutorial impact +% newcomer participation patterns?" These are the paper's _research +% questions_! It would be very helpful to the reader if they were displayed +% more clearly, e.g. as separate items. They should not be hidden. + +% System Design: +% * "...it does not depend on the availability, helpfulness, or +% intervention of existing Wikipedia editors..." The underlying argument +% here is that scalability is preferable to personal interaction when +% socializing newcomers (in peer production communities). Why is that the +% better solution? As discussed previously, TWA might be designed for +% contributors who are not going to stick around, why are those the right +% audience for it? Is the goal to provide _everyone_ with a scalable +% impersonal introduction, or is it better to provide _some_ (typically +% based on self-selection) with a personal introduction (e.g. the +% Teahouse)? + +% Game-like elements (subsection of System Design): +% * In "Missions" a distinction is made between "basic" and "advanced" +% editing techniques. It appears to be somewhat arbitrary, why is adding +% sources advanced editing, but watchlists are not? +% * Your readers might not now what watchlists are, take care to write for +% a general audience, not everyone knows a lot about how Wikipedia works +% behind the scenes. + +% Study 1: User Survey: +% * This paper doesn't discuss any other language editions of Wikipedia +% besides the English one, and makes the assumption that "Wikipedia" equals +% the English edition. Adding a mention that Wikipedia exists in multiple +% languages and explaining why English was chosen as the language where +% TWA was launched would be very helpful. +% * The paper aims to measure "educational effectiveness". Why is a survey +% the appropriate way to measure that? Based on the description of the +% survey, it seems that it never asks specific questions to test whether +% TWA's users learned specific things, in other words whether the education +% was successful. Later when describing the results the phrase "learning to +% edit Wikipedia" is used, isn't that the _key_ learning goal of TWA? Yet +% the survey asks Likert-scale questions. In other words, you're measuring +% whether TWA users are under the impression that they learned something, +% not whether they actually did. +% * Figure 4 uses counts. While it shows that none of the questions had +% responses from all participants, it makes comparisons between questions +% with different response rates very difficult. Using percentages would +% allow for direct comparisons, and makes the references to the figure in +% the text easier to follow along with. 
The text refers to four questions +% with a certain percentage of responses, but leaves the math to the +% reader. +% * The survey leaves many questions unanswered, some of which the paper +% might want to address. Were any negative questions asked? Were there any +% control questions, such as a similar question worded slightly differently +% to allow for comparison between responses? As it is, this survey comes +% across as a set of positive statements about TWA that respondents agreed +% to. Given that respondents self-select and no attempts to contact users +% who didn't go through TWA appears to have been made, it is likely there +% is a bias in the responses, and that bias should be discussed. + +% Study 2: Field Experiment: +% * The description of how accounts were selected to be included is rather +% confusing. First it describes 1,967 accounts that met the same criteria +% as for the user survey, however 10,000 individuals ("accounts"?) were +% invited to the beta. Why is one an order of magnitude larger than the +% other? Then in the second paragraph of "Methods" it describes the +% selection criteria, that at least one contribution would have to be made +% after getting invited. This would perhaps be much less confusing if the +% criteria were first explained, particularly how the experiment and +% control groups were set up, and then how many accounts were identified. +% * "This is a larger proportion of users than took up the invitation in +% Study 1, which may be due to changes in the invitation text." Earlier in +% the paper study 1 refers to a "beta", whereas this appears to be not. If +% this is the case, this is an important difference between the two that +% should be made clear to the reader. +% * "we measure the overall contributions as the total number of edits made +% by each account from the time of inclusion in the study until May 31, +% 2014." When exactly is "time of inclusion", is that when they got the +% invite? What about when they completed one (or all) TWA mission(s)? The +% concern here is that all contributions are measured, whereas the +% experiment sets up a pre/post-scenario. Later on the paper refers to +% "subsequent contributions", indicating that contributions after a certain +% point in time was measured. This quickly becomes rather confusing, +% spelling out clearly what points in a user's account history is used +% (e.g. "we measure contributions at four points in time: when the user +% registered their account, the time of invitation, when they first started +% using TWA, and the end of the experiment") would be very helpful. +% * Why is a six-edit radius chosen when measuring word persistence? +% Halfaker et al. make no claim about what the radius should be in the +% referenced work, and Ekstrand et al suggest a 15 edit radius in a related +% paper (Ekstrand and Riedl "rv you're dumb: identifying discarded work in +% Wiki article history." WikiSym 2009) The six-edit radius also comes with +% an issue that is unadressed: how long does it take for an edit made by a +% contributor in the study to reach that six-edit radius? If it hasn't been +% reached at the end of the study period, that edit has to be discarded as +% its quality is unknown. 
In a related paper, Farzan and Kraut instead +% chose to use percentage of words that survived as a measure of quality +% (Farzan and Kraut "Wikipedia classroom experiment: bidirectional benefits +% of students' engagement in online production communities" CHI 2013) +% * Tables 1, 2, 3, and 4, as well as figure 6 should be brought closer +% together so it's easier to follow along. Table 1 occurs before the text +% that refers to it, and table 4 is two pages further along. Putting all +% tables and figure 6 on the same page might be a good solution. +% * Table 3 refers to users "reached" a mission. It is confusing how 181 +% users reached the final mission but did not complete it, yet in the text +% it seems these 181 users actually did. +% * The post-hoc power analysis is very useful! + +% Discussion: +% * "The new editors in our study may have had unpleasant experiences +% during their initial time on Wikipedia..." It appears that the survey +% asked no questions about this, yet is it not a very important issue +% related to TWA's success? +% * In "Limitations of gamification" the following sentence is found: +% "...our study is among the first that compares levels of participation in +% a task among individuals who were introduced to gamified learning first +% to those that were not." This is an _important_ finding, it shouldn't be +% hidden back here but instead be up front in the introduction! + +% Formatting and Reference Issues + + + +% ------------------------ Submission 516, Review 3 ------------------------ + +% Title: The Wikipedia Adventure: Field Evaluation of an Interactive Tutorial for New Users + +% Reviewer: AC-Reviewer + +% Expertise + +% 4 (Expert) + +% First Round Overall Recommendation + +% 3 (Maybe acceptable (with significant modifications)) + +% Contribution and Criteria for Evaluation + +% This paper presents the results of a deployment of a gameification-based +% system designed to retain new editors in Wikipedia. It is a negative +% results paper: the authors claim that they have conclusive evidence that +% the system did not work (although I have suggested a few additional lines +% of inquiry below that might problematize this assertion). + +% The committee will have to have a discussion about how to evaluate this +% paper, and likely negative results papers more generally. + +% Assessment of the Paper + +% This paper presents the results of a deployment of a gameification-based +% system designed to retain new editors in Wikipedia. It is a negative +% results paper: the authors claim that they have conclusive evidence that +% the system did not work (although I have suggested a few additional lines +% of inquiry below that might problematize this assertion). + +% The paper is very well-written and has some large positives. It also is a +% negative results paper, and the committee will have to decide how to +% handle this. In general, I’m strongly sympathetic to arguments to +% include more negative results papers in our proceedings, but I’m quite +% unclear on the details of how to do so (e.g. what defines a top-quality +% negative results paper?). I’m hopeful that this paper can instigate a +% broader discussion on this topic at the PC meeting. + +% All of that said, this paper also has a number of idiosyncratic +% limitations that make it perhaps not the best trial balloon for negative +% results papers. Below, I outline what I believe to be the paper’s +% positives and then describe these limitations in more detail, phrased as +% both critiques and questions. 
+ +% Overall, my recommendation is to invite the authors to revise and +% resubmit. If this occurs, I’ll want to see the below critiques +% addressed and the below questions answered (both through direct answers +% in the response to reviewers and through clarifications and changes to +% the paper). I’m hopeful through, through the R&R process, this paper +% can become an ideal negative results trial balloon. + + +% Important positives: + +% * The authors built a system to solve a real-life problemand did a +% real-life, relatively large-scale deployment. Awesome! +% * The paper is easily in the top 95% in terms of writing quality. This is +% true both at the sentence level and at the narrative level. As a person +% who has to review lots of papers, this was a breath of fresh air. +% * The design of the game is quite well-thought-out, save a few relatively +% arbitrary decisions. I was particularly compelled by the use of +% gameification techniques that are also present in “real Wikipedia” +% (e.g. barnstar-like rewards). + +% Critiques: + +% CRITIQUE #1 – Excessive import placed on trivial self-report data: It +% is well-known that self-report data from participants is inferior to +% observations of actual behavior, and that self-report data can be quite +% unreliable more generally. As such, in my view, it is not a contribution +% to show that self-report data didn’t end up panning out in the +% behavioral results. + +% In the next draft of this paper, I would like to see the authors address +% this issue. This might mean framing this paper as a full-on negative +% results paper, but lighter weight adaptations might be possible. + + +% Open questions: + +% QUESTION #1: As noted above, this paper is a negative results paper at +% its core, and we’ll have to have a broad discussion about this at the +% PC meeting, assuming the paper makes it this far. In the event that this +% occurs, can the authors provide a more robust argument as to why these +% negative results are important for other researchers and practitioners? + +% The paper attempts to argue that one contribution that comes out of its +% negative results is to distrust self-report data, but this is well-known +% (see below). The other negative results argument in the paper is that +% these results add to growing evidence of long-term gameificiation +% failures. I find this argument much more compelling. In other words, by +% expanding on this argument, the authors may be able to address this +% question. + +% That said, regardless of how this question is addressed in the second +% draft, I’d like to see it done both through changes to the paper and +% through discussion in the response to reviewers. + +% QUESTION #2 – Is there a possibility that the statistical framework +% employed is not appropriate for this particular study? + +% The authors utilize a two-level statistical approach that I haven’t +% seen before in the CSCW/CHI literature. I enjoyed thinking about this +% approach, and the authors did a relatively good job explaining it. That +% said, I’m currently not convinced that it was the appropriate framework +% for this study. Here’s my reasoning: + +% (1) The goal here is to introduce a treatment that ultimately will +% produce strong new members of the Wikipedia community at a higher rate +% than the control. +% (2) Let’s say the game produces 3 such members out of 100 new editors +% and the control produces 1, which looks like it might be the case. 
+% Let’s also say that this pattern additionally persists over a large n. +% (3) If this is true, why do we care about the potentially moderating +% effect of the invitations? + +% The authors argue that new editors that responded to the invitation to +% play the game might just be new editors who are engaged and, critically, +% would have been power editors whether or not the game existed. However, +% barring a random fluke, shouldn’t these future power editors also have +% been in the control group? If I’m right here, I’m thinking the +% invitation doesn’t matter and a more traditional statistical analysis +% (or at least one targeted at identifying rare events) is appropriate. + +% I could be wrong, but I want the authors to respond to this question, +% both through feedback to reviewers and clarifications in the paper. + +% As an important side note, if we agree that this framework is the right +% way to go in the end, the authors should puff their chests more about +% this by claiming it as a contribution (assuming it hasn’t been used at +% CSCW before). + +% Question #3 – Are the outcome variables considered here the best +% outcome variables? Are some critical variables missing? + +% The authors seem focused on the average effects across the entire control +% and treatment groups (the two treatment groups, to be specific). However, +% would it not also be reasonable to consider the metric I describe above: +% the % of new editors that go on to be power editors? Since power editors +% end up contributing most of the edits anyway *over the long term*, to me +% this seems like the way to go (i.e. if this group of editors were +% followed for years, statistically significant differences would begin to +% emerge). If the authors agree, the authors need to reanalyze their data +% with this metric in mind. + +% Another related outcome variable that might be useful to analyze is how +% long the new editors in each group remained active editors in the +% community (i.e. survival analysis). Because the data is quite old, this +% should be an easy new analysis to run, and longevity has been a variable +% interest in a number of peer production studies. + +% In their second draft and the feedback to reviewers, I would like to see +% the authors discuss either new analyses related to power users or why thy +% did not consider this outcome variable. I would also like to see the same +% for survival analysis. + +% QUESTION #4: Is there a path towards positive results? + +% As noted above, I believe some discussion around this paper and negative +% results papers more generally will have to happen at the PC meeting. +% However, I think there are so missed opportunities here for positive +% results and that the authors were too quick to settle for negative +% results. This is likely an important factor to consider when deciding +% whether to accept a negative results paper. + +% Most notably, there are several, well-motivated unexplored avenues that +% could lead to positive results that would have a much larger impact than +% the negative results presented here: + +% * As noted above, examining additional outcome variables is important, +% most notably # of power editors and longevity. +% * Does the game work if folks are forced to play it prior to editing +% Wikipedia, as would be the case in most other institutionalized +% socialization contexts? This is not just a hypothetical: this game could +% be used in all Wikipedia Education Project classes and related endeavors. + + +\end{document}