Point at the updated data URL, add explicit NA handling to the timestamp factor, remove the unnecessary ggplot2 library call, and update the corresponding output generated from the new data file. May not work while the kibo URLs are getting resolved.

This commit is contained in:
aaronshaw
2020-04-01 16:52:22 -05:00
parent 4f8a698c62
commit 282588772e
2 changed files with 15 additions and 14 deletions

View File

@@ -4,13 +4,12 @@
### Minimal example analysis file using pageview data
library(tidyverse)
library(ggplot2)
library(scales)
### Import and cleanup data
### Import and cleanup one datafile from the observatory
### Open a connection to the tab-separated daily pageview file.
### NOTE(review): this excerpt looks like a diff whose +/- markers were
### stripped, so the two url() calls below are presumably the old (GitHub)
### and new (covid19.communitydata.science) locations -- confirm which one
### the committed file keeps. Read literally, only the first call is
### assigned to DataURL; the second opens a connection that is never used.
DataURL <-
url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/wikipedia_views/data/dailyviews2020032600.tsv")
url("https://covid19.communitydata.science/datasets/wikipedia/digobs_covid19-wikipedia-enwiki_dailyviews-20200401.tsv")
### Parse the file into a data frame: fields are tab-separated, the first
### row supplies the column names, and character columns are kept as
### strings instead of being converted to factors.
views <-
read.table(DataURL, sep="\t", header=TRUE, stringsAsFactors=FALSE)
@@ -30,12 +29,14 @@ views <-
### (see https://www.tidyverse.org for more info)
### Keep only the four columns used in the rest of this example
views <- views[,c("article", "project", "timestamp", "views")]
### Convert timestamp to a factor (it is used as a grouping variable below)
views$timestamp <- factor(views$timestamp)
### Turn missing timestamps into an explicit factor level instead of NA.
### NOTE(review): fct_explicit_na() comes from forcats (attached by
### tidyverse); recent forcats releases deprecate it in favour of
### fct_na_value_to_level() -- confirm against the installed version.
views$timestamp <- fct_explicit_na(views$timestamp)
### Sorts and groups at the same time
### NOTE(review): the rows are ordered by desc(views) and the result carries
### the project/timestamp grouping; dplyr's arrange() presumably ignores
### the groups while sorting unless .by_group = TRUE -- confirm that an
### overall (not per-group) ordering is what is wanted here.
views.by.proj.date <- arrange(group_by(views, project, timestamp),
desc(views))
### Export just the top 10 by pageviews
write.table(head(views.by.proj.date, 10),
file="output/top10_views_by_project_date.csv", sep=",",