Point at the updated data URL, add explicit NA handling to the timestamp factor, cut the unnecessary call to library(ggplot2), and update the corresponding output from the new data file. Note: this may not work while the kibo URLs are still getting resolved.

This commit is contained in:
aaronshaw 2020-04-01 16:52:22 -05:00
parent 4f8a698c62
commit 282588772e
2 changed files with 15 additions and 14 deletions

View File

@ -1,11 +1,11 @@
"article","project","timestamp","views" "article","project","timestamp","views"
"201920_coronavirus_pandemic","en.wikipedia","2020032600",1148284 "201920_coronavirus_pandemic","en.wikipedia","2020033100",831879
"2020_coronavirus_pandemic_in_India","en.wikipedia","2020032600",513901 "2020_coronavirus_pandemic_in_India","en.wikipedia","2020033100",323123
"Coronavirus","en.wikipedia","2020032600",397959 "201920_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020033100",315572
"2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020032600",337676 "2020_coronavirus_pandemic_in_the_United_States","en.wikipedia","2020033100",290535
"201920_coronavirus_pandemic_by_country_and_territory","en.wikipedia","2020032600",298603 "Coronavirus_disease_2019","en.wikipedia","2020033100",211391
"2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020032600",297687 "2020_coronavirus_pandemic_in_Italy","en.wikipedia","2020033100",209908
"Coronavirus_disease_2019","en.wikipedia","2020032600",292272 "Coronavirus","en.wikipedia","2020033100",188921
"2020_coronavirus_pandemic_in_Spain","en.wikipedia","2020032600",114732 "USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422
"2020_coronavirus_pandemic_in_the_United_Kingdom","en.wikipedia","2020032600",111856 "USNS_Comfort_(T-AH-20)","en.wikipedia","2020033100",150422
"Anthony_Fauci","en.wikipedia","2020032600",103205 "WrestleMania_36","en.wikipedia","2020033100",137637

1 article project timestamp views
2 2019–20_coronavirus_pandemic en.wikipedia 2020032600 2020033100 1148284 831879
3 2020_coronavirus_pandemic_in_India en.wikipedia 2020032600 2020033100 513901 323123
4 Coronavirus 2019–20_coronavirus_pandemic_by_country_and_territory en.wikipedia 2020032600 2020033100 397959 315572
5 2020_coronavirus_pandemic_in_the_United_States en.wikipedia 2020032600 2020033100 337676 290535
6 2019–20_coronavirus_pandemic_by_country_and_territory Coronavirus_disease_2019 en.wikipedia 2020032600 2020033100 298603 211391
7 2020_coronavirus_pandemic_in_Italy en.wikipedia 2020032600 2020033100 297687 209908
8 Coronavirus_disease_2019 Coronavirus en.wikipedia 2020032600 2020033100 292272 188921
9 2020_coronavirus_pandemic_in_Spain USNS_Comfort_(T-AH-20) en.wikipedia 2020032600 2020033100 114732 150422
10 2020_coronavirus_pandemic_in_the_United_Kingdom USNS_Comfort_(T-AH-20) en.wikipedia 2020032600 2020033100 111856 150422
11 Anthony_Fauci WrestleMania_36 en.wikipedia 2020032600 2020033100 103205 137637

View File

@ -4,13 +4,12 @@
### Minimal example analysis file using pageview data ### Minimal example analysis file using pageview data
library(tidyverse) library(tidyverse)
library(ggplot2)
library(scales) library(scales)
### Import and cleanup data ### Import and cleanup one datafile from the observatory
DataURL <- DataURL <-
url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv") url("https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_dailyviews-20200401.tsv")
views <- views <-
read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE) read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE)
@ -30,12 +29,14 @@ views <-
### (see https://www.tidyverse.org for more info) ### (see https://www.tidyverse.org for more info)
views <- views[,c("article", "project", "timestamp", "views")] views <- views[,c("article", "project", "timestamp", "views")]
views$timestamp <- factor(views$timestamp) views$timestamp <- fct_explicit_na(views$timestamp)
### Sorts and groups at the same time ### Sorts and groups at the same time
views.by.proj.date <- arrange(group_by(views, project, timestamp), views.by.proj.date <- arrange(group_by(views, project, timestamp),
desc(views)) desc(views))
### Export just the top 10 by pageviews ### Export just the top 10 by pageviews
write.table(head(views.by.proj.date, 10), write.table(head(views.by.proj.date, 10),
file="output/top10_views_by_project_date.csv", sep=",", file="output/top10_views_by_project_date.csv", sep=",",