Merge pull request #9 from aaronshaw/master

minimal analysis example with pageview data
2020-03-28 20:42:40 -05:00 · 2020-03-28 20:42:40 -05:00 · 05b8025e15
commit 05b8025e15
parent c0e50fe297 5dfbe3dab4
2 changed files with 62 additions and 0 deletions
--- a/wikipedia_views/analysis/output/top10_views_by_project_date.csv
+++ b/wikipedia_views/analysis/output/top10_views_by_project_date.csv
@ -0,0 +1,11 @@
 "article","project","timestamp","views"
 "2019–20_coronavirus_pandemic","en.wikipedia","2020032600",1148284
 "2020_coronavirus_pandemic_in_India","en.wikipedia","2020032600",513901
 "Coronavirus","en.wikipedia","2020032600",397959
 "2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020032600",337676
 "2019–20_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020032600",298603
 "2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020032600",297687
 "Coronavirus_disease_2019","en.wikipedia","2020032600",292272
 "2020_coronavirus_pandemic_in_Spain","en.wikipedia","2020032600",114732
 "2020_coronavirus_pandemic_in_the_United_Kingdom","en.wikipedia","2020032600",111856
 "Anthony_Fauci","en.wikipedia","2020032600",103205
--- a/wikipedia_views/analysis/pageview_example.R
+++ b/wikipedia_views/analysis/pageview_example.R
@ -0,0 +1,51 @@
 ### COVID-19 Digital Observatory
 ### 2020-03-28
 ### 
 ### Minimal example analysis file using pageview data
 library(tidyverse)
 library(ggplot2)
 library(scales)
 ### Import and cleanup data
 ###
 ### Article titles can contain apostrophes, and read.table() treats "'"
 ### as a quote character by default, which can silently merge or drop
 ### rows. Only double quotes are honoured as quoting here.
 DataURL <-
     url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv")
 views <-
     read.table(DataURL, sep = "\t", header = TRUE,
                stringsAsFactors = FALSE, quote = "\"")
 ### Alternatively, uncomment and run if working locally with full git
 ### tree
 ###
 ### Identify data source directory and file
 ## DataDir <- "../data"
 ## DataFile <- "dailyviews2020032600.tsv"
 ## views <- read.table(file.path(DataDir, DataFile),
 ##                     sep = "\t", header = TRUE,
 ##                     stringsAsFactors = FALSE, quote = "\"")
 ### Cleanup and do the grouping with functions from the Tidyverse
 ### (see https://www.tidyverse.org for more info)
 ###
 ### Keep only the columns used below; the timestamp is an identifier,
 ### not a quantity, so store it as a factor.
 views <- views[, c("article", "project", "timestamp", "views")]
 views$timestamp <- factor(views$timestamp)
 ### Group by project and date, then sort by descending views within
 ### each group. arrange() ignores grouping unless .by_group = TRUE.
 views.by.proj.date <- views %>%
     group_by(project, timestamp) %>%
     arrange(desc(views), .by_group = TRUE)
 ### Export just the top 10 by pageviews for each project/date group
 top10.by.proj.date <- views.by.proj.date %>%
     filter(row_number() <= 10) %>%
     ungroup()
 ## The path is relative to the working directory; create the output
 ## directory if it is missing so the write does not fail.
 dir.create("output", showWarnings = FALSE)
 ## qmethod = "double" doubles embedded quotes, as CSV readers expect
 write.table(top10.by.proj.date,
             file = "output/top10_views_by_project_date.csv", sep = ",",
             row.names = FALSE, qmethod = "double")
 ### A simple visualization
 p <- ggplot(data = views.by.proj.date, aes(views))
 ## Density plot with log-transformed axis. print() explicitly so the
 ## plot is also drawn when the script is run through source().
 print(p + geom_density() + scale_x_log10(labels = comma))