renaming example analysis directories
This commit is contained in:
@@ -0,0 +1,11 @@
|
||||
"term","date","query.1","query.2","query.3","query.4","query.5"
|
||||
"coronavirus",2020-03-27,coronavirus update,corona,coronavirus symptoms,news coronavirus,coronavirus cases
|
||||
"covid-19",2020-03-27,covid-19 coronavirus,coronavirus,covid,covid-19 cases,covid 19
|
||||
"covid-19 pandemic",2020-03-27,coronavirus,covid-19 coronavirus pandemic,coronavirus pandemic,who,is covid-19 a pandemic
|
||||
"covid19",2020-03-27,covid,covid 19,coronavirus covid19,coronavirus,covid19 cases
|
||||
"sars-cov-2",2020-03-27,coronavirus,coronavirus sars-cov-2,covid-19,covid-19 sars-cov-2,sars
|
||||
"coronavirus",2020-03-28,coronavirus update,corona,coronavirus symptoms,news coronavirus,coronavirus cases
|
||||
"covid-19",2020-03-28,coronavirus,coronavirus covid-19,covid,covid-19 cases,covid 19
|
||||
"covid-19 pandemic",2020-03-28,coronavirus pandemic,coronavirus,covid-19 coronavirus pandemic,is covid-19 a pandemic,who pandemic
|
||||
"covid19",2020-03-28,covid,covid 19,coronavirus covid19,coronavirus,covid19 cases
|
||||
"sars-cov-2",2020-03-28,coronavirus sars-cov-2,coronavirus,sars-cov-2 covid-19,covid-19,sars
|
||||
|
BIN
keywords/example_analysis/output/top_queries_plot.png
Normal file
BIN
keywords/example_analysis/output/top_queries_plot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 48 KiB |
36
keywords/example_analysis/related_searches_example.R
Normal file
36
keywords/example_analysis/related_searches_example.R
Normal file
@@ -0,0 +1,36 @@
|
||||
### COVID-19 Digital Observatory
|
||||
### 2020-03-28
|
||||
###
|
||||
### Minimal example analysis file using trending search data
|
||||
|
||||
### Import and cleanup data
|
||||
|
||||
### GitHub "/blob/" URLs return an HTML page rather than the file itself,
### which read.table() cannot parse as CSV, so request the "/raw/" contents.
### NOTE(review): the "transliterations/data/output" path is unchanged from the
### original; the tidyverse version of this example reads the same file from
### "keywords/output/intermediate/" -- confirm which location is current.
DataURL <-
    url("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/transliterations/data/output/related_searches_top.csv")

### quote/comment.char are set explicitly so that an apostrophe or "#" inside
### a search query is kept as data (read.table's defaults treat both specially).
related.searches.top <- read.table(DataURL,
                                   sep=",", header=TRUE,
                                   quote="\"", comment.char="",
                                   stringsAsFactors=FALSE)

### Alternatively, uncomment and run if working locally with full git tree
###
### Identify data source directory and file
## DataDir <- ("../data/output/")
## DataFile <- ("related_searches_top.csv")

## related.searches.top <- read.table(paste(DataDir,DataFile, sep=""),
##                                    sep=",", header=TRUE,
##                                    stringsAsFactors=FALSE)

### Aggregate top 5 search queries by term/day
### head(query, 5) keeps the first five rows of each term/date group in file
### order, so this relies on the input already being sorted by rank.
top5.per.term.date <- aggregate(query ~ term + date,
                                data=related.searches.top,
                                head, 5)

## Might cleanup a bit for further analysis or visualization...
top5.per.term.date$date <- as.Date(top5.per.term.date$date)

### Export
### Create the output directory first in case the script is run outside the
### git tree (no-op when it already exists).
dir.create("output", showWarnings=FALSE)
write.table(top5.per.term.date,
            file="output/top5_queries_per_term_per_date.csv", sep=",",
            row.names=FALSE)
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
### COVID-19 Digital Observatory
|
||||
### 2020-03-28
|
||||
###
|
||||
### Minimal example analysis file using trending search data
|
||||
|
||||
library(tidyverse)
|
||||
|
||||
### Import and cleanup data
|
||||
|
||||
|
||||
related.searches.top <- read_csv("https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory/raw/master/keywords/output/intermediate/related_searches_top.csv")

## Plot the ten queries that appear most often among the top 10 suggested
## queries, counted over every term/date pair

plot <- related.searches.top %>%
  group_by(term, date) %>%              # Group by term and date
  arrange(-value) %>%                   # Sort by value (this should already be done anyway)
  top_n(10, wt = value) %>%             # Top 10 queries per term-day pair; wt is explicit so the
                                        # ranking does not depend on which column happens to be last
  group_by(query) %>%                   # Group by again, this time for each query
  summarize(appearances = n()) %>%      # Count how often this query appears in the top 10 (which is how many Google displays)
  arrange(-appearances) %>%             # Sort by appearances
  top_n(10, wt = appearances) %>%       # And get the top 10 queries (ties can return more than 10)
  ggplot(aes(x=reorder(query, appearances), y=appearances)) + # Plot the number of appearances, ordered by appearances
  geom_bar(stat = 'identity') +         # Tell R that we want to use the values of `appearances` as the counts
  coord_flip() +                        # Flip the plot
  xlab("Query") +
  ylab("Number of appearances in top 10 suggested queries") +
  theme_minimal()                       # And make it minimal

ggsave('./output/top_queries_plot.png', plot)
|
||||
17
keywords/example_analysis/translations_example.R
Normal file
17
keywords/example_analysis/translations_example.R
Normal file
@@ -0,0 +1,17 @@
|
||||
## example reading latest file straight from the server
df <- read.csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv",
               stringsAsFactors=FALSE)

## make the data more R-friendly
## read.csv() already converts a column holding only "True"/"False" to logical,
## and comparing a logical with the string "True" is always FALSE (TRUE is
## coerced to "TRUE"). as.logical() gives the right answer whether the column
## arrived as character or as logical.
df$is.alt <- as.logical(df$is_alt)
df$is_alt <- NULL

## find all translations for coronavirus
coronavirus.itemids <- df[ (tolower(df$label) == "coronavirus") &
                           (df$langcode == 'en')
                         ,"itemid"]

## there are actually 5 item ids. The one referring to the family of virus is Q57751738
coronavirus.translations <- df[df$itemid == "http://www.wikidata.org/entity/Q57751738",]

## let's only look at non-aliases
## which() drops rows whose is.alt is NA instead of printing all-NA rows
print(coronavirus.translations[which(!coronavirus.translations$is.alt), c("label","langcode")])
|
||||
13
keywords/example_analysis/translations_example.py
Normal file
13
keywords/example_analysis/translations_example.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import pandas as pd
|
||||
|
||||
# read the latest dataset
df = pd.read_csv("https://covid19.communitydata.science/datasets/keywords/csv/latest.csv")

# find the item ids whose English label is "coronavirus"
# (same filter as the R version of this example: label match AND langcode == 'en')
coronavirus_itemids = df.loc[
    (df.label.str.lower() == "coronavirus") & (df.langcode == "en"),
    "itemid",
]

# there are actually 5 item ids. The one referring to the family of virus is Q57751738
coronavirus_translations = df.loc[df.itemid == "http://www.wikidata.org/entity/Q57751738"]

# let's only look at non-alias labels
# The mask is built from coronavirus_translations itself rather than from df,
# so the indexer and the frame being indexed cover exactly the same rows.
is_primary_label = coronavirus_translations.is_alt == False
print(coronavirus_translations.loc[is_primary_label, ['label', 'langcode']])
|
||||
Reference in New Issue
Block a user