Merge branch 'master' of https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory
This commit is contained in:
commit
7b3062ffb1
3
transliterations/README.md
Normal file
3
transliterations/README.md
Normal file
@ -0,0 +1,3 @@
|
||||
# Transliterations
|
||||
|
||||
This part of the project collects transliterations of key phrases related to COVID-19 using Wikidata. We search the Wikidata API for entities in `src/wikidata_search.py` and then we make simple SPARQL queries in `src/wikidata_transliterations.py` to collect labels and aliases for the entities. The labels come with language metadata. This seems to provide a decent initial list of relevant terms across multiple languages.
|
@ -0,0 +1,6 @@
|
||||
"term","date","query.1","query.2","query.3","query.4","query.5"
|
||||
"coronavirus","2020-03-27",coronavirus update,corona,coronavirus symptoms,news coronavirus,coronavirus cases
|
||||
"covid-19","2020-03-27",covid-19 coronavirus,coronavirus,covid,covid-19 cases,covid 19
|
||||
"covid-19 pandemic","2020-03-27",coronavirus,covid-19 coronavirus pandemic,coronavirus pandemic,who,is covid-19 a pandemic
|
||||
"covid19","2020-03-27",covid,covid 19,coronavirus covid19,coronavirus,covid19 cases
|
||||
"sars-cov-2","2020-03-27",coronavirus,coronavirus sars-cov-2,covid-19,covid-19 sars-cov-2,sars
|
|
28
transliterations/analysis/related_searches_example.R
Normal file
28
transliterations/analysis/related_searches_example.R
Normal file
@ -0,0 +1,28 @@
|
||||
### COVID-19 Digital Observatory
|
||||
### 2020-03-28
|
||||
###
|
||||
### Minimal example analysis file using trending search data
|
||||
|
||||
### Identify data source directory and file
DataDir <- "../data/output/"
DataFile <- "related_searches_top.csv"

### Import and cleanup data
## read.csv() is used instead of read.table(sep=","): read.table() treats
## both ' and " as quote characters and # as a comment marker by default,
## so a search query containing an apostrophe or a hash would silently
## corrupt the parse. read.csv() only honours " and disables comments.
related.searches.top <- read.csv(paste0(DataDir, DataFile),
                                 stringsAsFactors=FALSE)

### Aggregate top 5 search queries by term/day
## head() keeps the first five rows of each term/date group in file order,
## so this relies on the input listing queries by descending value within
## each group (as the collected file does).
top5.per.term.date <- aggregate(query ~ term + date,
                                data=related.searches.top,
                                head, 5)

## Might cleanup a bit for further analysis or visualization...
top5.per.term.date$date <- as.Date(top5.per.term.date$date)

### Export
## Create the export directory first so write.table() does not fail on a
## fresh checkout where it does not exist yet.
OutDir <- "output"
dir.create(OutDir, showWarnings=FALSE, recursive=TRUE)
write.table(top5.per.term.date,
            file=file.path(OutDir, "top5_queries_per_term_per_date.csv"),
            sep=",",
            row.names=FALSE)
|
||||
|
6
transliterations/data/input/base_terms.txt
Normal file
6
transliterations/data/input/base_terms.txt
Normal file
@ -0,0 +1,6 @@
|
||||
coronavirus
|
||||
covid-19
|
||||
covid19
|
||||
sars-cov-2
|
||||
covid-19 pandemic
|
||||
sars-cov-2 pandemic
|
9300
transliterations/data/output/2020-03-27_wikidata_entity_labels.csv
Normal file
9300
transliterations/data/output/2020-03-27_wikidata_entity_labels.csv
Normal file
File diff suppressed because it is too large
Load Diff
10806
transliterations/data/output/2020-03-28_wikidata_entity_labels.csv
Normal file
10806
transliterations/data/output/2020-03-28_wikidata_entity_labels.csv
Normal file
File diff suppressed because it is too large
Load Diff
36
transliterations/data/output/daily_google_trends.csv
Normal file
36
transliterations/data/output/daily_google_trends.csv
Normal file
@ -0,0 +1,36 @@
|
||||
date,term,top
|
||||
2020-03-27,COVID-19,0
|
||||
2020-03-27,Boris Johnson,1
|
||||
2020-03-27,Thomas Massie,2
|
||||
2020-03-27,Ozark,3
|
||||
2020-03-27,Drew Brees,4
|
||||
2020-03-27,Bill Gates,5
|
||||
2020-03-27,Kourtney Kardashian,6
|
||||
2020-03-27,Doris Burke,7
|
||||
2020-03-27,Pope Francis,8
|
||||
2020-03-27,Krispy Kreme,9
|
||||
2020-03-27,Roman Reigns,10
|
||||
2020-03-27,A Million Little Things,11
|
||||
2020-03-27,Pearl Jam Gigaton,12
|
||||
2020-03-27,Gretchen Whitmer,13
|
||||
2020-03-27,Bill Ackman,14
|
||||
2020-03-27,Till Lindemann,15
|
||||
2020-03-27,Trish Regan,16
|
||||
2020-03-27,Steven Universe Future,17
|
||||
2020-03-27,Urbi et Orbi,18
|
||||
2020-03-27,Nc stay-at-home order,19
|
||||
2020-03-28,Dog,0
|
||||
2020-03-28,Tom Coburn,1
|
||||
2020-03-28,Cat,2
|
||||
2020-03-28,Weather radar,3
|
||||
2020-03-28,Horse,4
|
||||
2020-03-28,Elephant,5
|
||||
2020-03-28,Bear,6
|
||||
2020-03-28,Penguin,7
|
||||
2020-03-28,Joseph Lowery,8
|
||||
2020-03-28,North Carolina,9
|
||||
2020-03-28,Wolf,10
|
||||
2020-03-28,Predator: Hunting Grounds,11
|
||||
2020-03-28,Duck,12
|
||||
2020-03-28,Vivarium,13
|
||||
2020-03-28,Instacart strike,14
|
|
240
transliterations/data/output/related_searches_rising.csv
Normal file
240
transliterations/data/output/related_searches_rising.csv
Normal file
@ -0,0 +1,240 @@
|
||||
query,value,term,date
|
||||
coronavirus tips,102600,coronavirus,2020-03-27
|
||||
covid 19,72600,coronavirus,2020-03-27
|
||||
trump coronavirus,52900,coronavirus,2020-03-27
|
||||
notizie coronavirus,50350,coronavirus,2020-03-27
|
||||
wuhan coronavirus,48850,coronavirus,2020-03-27
|
||||
ultime coronavirus,45300,coronavirus,2020-03-27
|
||||
coronavirus ultime notizie,39600,coronavirus,2020-03-27
|
||||
coronavirus worldometer,37350,coronavirus,2020-03-27
|
||||
us coronavirus cases,36250,coronavirus,2020-03-27
|
||||
italie coronavirus,34850,coronavirus,2020-03-27
|
||||
coronavirus update india,25550,coronavirus,2020-03-27
|
||||
coronavirus in italia,25500,coronavirus,2020-03-27
|
||||
covid-19,24150,coronavirus,2020-03-27
|
||||
coronavirus live map,23700,coronavirus,2020-03-27
|
||||
john hopkins coronavirus,23650,coronavirus,2020-03-27
|
||||
coronavirus lombardia,23400,coronavirus,2020-03-27
|
||||
coronavirus meme,23050,coronavirus,2020-03-27
|
||||
coronavirus live update,23050,coronavirus,2020-03-27
|
||||
coronavirus update uk,20950,coronavirus,2020-03-27
|
||||
decreto coronavirus,19750,coronavirus,2020-03-27
|
||||
coronavirus österreich,19250,coronavirus,2020-03-27
|
||||
coronavirus roma,18900,coronavirus,2020-03-27
|
||||
coronavirus memes,17350,coronavirus,2020-03-27
|
||||
coronavirus cases in india,16050,coronavirus,2020-03-27
|
||||
coronavirus georgia,16050,coronavirus,2020-03-27
|
||||
covid-19 coronavirus,1218150,covid-19,2020-03-27
|
||||
coronavirus,1181150,covid-19,2020-03-27
|
||||
covid,682700,covid-19,2020-03-27
|
||||
covid-19 cases,605100,covid-19,2020-03-27
|
||||
covid 19,473950,covid-19,2020-03-27
|
||||
corona,405300,covid-19,2020-03-27
|
||||
virus covid-19,398550,covid-19,2020-03-27
|
||||
covid-19 map,273250,covid-19,2020-03-27
|
||||
covid-19 symptoms,271350,covid-19,2020-03-27
|
||||
covid-19 news,247000,covid-19,2020-03-27
|
||||
covid-19 updates,238250,covid-19,2020-03-27
|
||||
covid-19 update,213450,covid-19,2020-03-27
|
||||
covid-19 live,203950,covid-19,2020-03-27
|
||||
corona virus,201950,covid-19,2020-03-27
|
||||
covid-19 us,185450,covid-19,2020-03-27
|
||||
who,169900,covid-19,2020-03-27
|
||||
covid-19 who,169200,covid-19,2020-03-27
|
||||
covid-19 canada,167650,covid-19,2020-03-27
|
||||
what is covid-19,165850,covid-19,2020-03-27
|
||||
covid-19 cdc,143000,covid-19,2020-03-27
|
||||
cdc,139750,covid-19,2020-03-27
|
||||
covid-19 test,136250,covid-19,2020-03-27
|
||||
covid-19 italy,134850,covid-19,2020-03-27
|
||||
covid-19 china,128200,covid-19,2020-03-27
|
||||
covid-19 usa,118050,covid-19,2020-03-27
|
||||
covid,1438650,covid19,2020-03-27
|
||||
covid 19,1282950,covid19,2020-03-27
|
||||
coronavirus covid19,950500,covid19,2020-03-27
|
||||
coronavirus,930200,covid19,2020-03-27
|
||||
covid19 cases,496800,covid19,2020-03-27
|
||||
corona,395600,covid19,2020-03-27
|
||||
corona covid19,390150,covid19,2020-03-27
|
||||
covid19 virus,368100,covid19,2020-03-27
|
||||
covid19 news,253550,covid19,2020-03-27
|
||||
covid19 update,223300,covid19,2020-03-27
|
||||
symptoms covid19,222750,covid19,2020-03-27
|
||||
covid19 symptoms,220150,covid19,2020-03-27
|
||||
covid19 map,217850,covid19,2020-03-27
|
||||
corona virus,206600,covid19,2020-03-27
|
||||
covid19 us,178600,covid19,2020-03-27
|
||||
what is covid19,173100,covid19,2020-03-27
|
||||
covid19 who,159800,covid19,2020-03-27
|
||||
who,156600,covid19,2020-03-27
|
||||
canada covid19,155350,covid19,2020-03-27
|
||||
italy covid19,154100,covid19,2020-03-27
|
||||
italy,147850,covid19,2020-03-27
|
||||
test covid19,146500,covid19,2020-03-27
|
||||
china covid19,144300,covid19,2020-03-27
|
||||
covid19 uk,130850,covid19,2020-03-27
|
||||
covid-19,129800,covid19,2020-03-27
|
||||
coronavirus,707750,sars-cov-2,2020-03-27
|
||||
coronavirus sars-cov-2,687850,sars-cov-2,2020-03-27
|
||||
covid-19,568100,sars-cov-2,2020-03-27
|
||||
covid-19 sars-cov-2,538200,sars-cov-2,2020-03-27
|
||||
sars,346250,sars-cov-2,2020-03-27
|
||||
virus sars-cov-2,306300,sars-cov-2,2020-03-27
|
||||
corona,241200,sars-cov-2,2020-03-27
|
||||
sars-cov-2 covid 19,224250,sars-cov-2,2020-03-27
|
||||
covid 19,210650,sars-cov-2,2020-03-27
|
||||
corona virus,121450,sars-cov-2,2020-03-27
|
||||
sars-cov-2 vs covid-19,107850,sars-cov-2,2020-03-27
|
||||
what is sars-cov-2,107650,sars-cov-2,2020-03-27
|
||||
sars cov 2,91850,sars-cov-2,2020-03-27
|
||||
sars-cov-2 wiki,56650,sars-cov-2,2020-03-27
|
||||
koronawirus,47000,sars-cov-2,2020-03-27
|
||||
cdc,44300,sars-cov-2,2020-03-27
|
||||
sars-cov,44100,sars-cov-2,2020-03-27
|
||||
sars-cov-1,41000,sars-cov-2,2020-03-27
|
||||
sars-cov-2 vs cod-19,28450,sars-cov-2,2020-03-27
|
||||
sars-cov-2 genome,22300,sars-cov-2,2020-03-27
|
||||
pubmed,22200,sars-cov-2,2020-03-27
|
||||
коронавирус,21950,sars-cov-2,2020-03-27
|
||||
sars-cov-2 or covid-19,19150,sars-cov-2,2020-03-27
|
||||
on the origin and continuing evolution of sars-cov-2,18950,sars-cov-2,2020-03-27
|
||||
the proximal origin of sars-cov-2,18800,sars-cov-2,2020-03-27
|
||||
coronavirus,707350,covid-19 pandemic,2020-03-27
|
||||
covid-19 coronavirus pandemic,682850,covid-19 pandemic,2020-03-27
|
||||
coronavirus pandemic,655700,covid-19 pandemic,2020-03-27
|
||||
who,288300,covid-19 pandemic,2020-03-27
|
||||
is covid-19 a pandemic,269250,covid-19 pandemic,2020-03-27
|
||||
who pandemic,262250,covid-19 pandemic,2020-03-27
|
||||
covid 19 pandemic,248100,covid-19 pandemic,2020-03-27
|
||||
pandemic meaning,140650,covid-19 pandemic,2020-03-27
|
||||
what is a pandemic,107950,covid-19 pandemic,2020-03-27
|
||||
pandemic vs epidemic,98050,covid-19 pandemic,2020-03-27
|
||||
cdc pandemic,97750,covid-19 pandemic,2020-03-27
|
||||
pandemic definition,94750,covid-19 pandemic,2020-03-27
|
||||
pandemic define,61950,covid-19 pandemic,2020-03-27
|
||||
covid-19 pandemic plan,58650,covid-19 pandemic,2020-03-27
|
||||
coronavirus pandemic covid-19 live world map/count,55350,covid-19 pandemic,2020-03-27
|
||||
who declared covid-19 pandemic,52050,covid-19 pandemic,2020-03-27
|
||||
covid-19 pandemic unemployment payment,51900,covid-19 pandemic,2020-03-27
|
||||
coronavirus update,32550,covid-19 pandemic,2020-03-27
|
||||
coronavirus tips,109950,coronavirus,2020-03-28
|
||||
covid,97600,coronavirus,2020-03-28
|
||||
coronavirus death toll,52800,coronavirus,2020-03-28
|
||||
wuhan coronavirus,49100,coronavirus,2020-03-28
|
||||
coronavirus argentina,40350,coronavirus,2020-03-28
|
||||
coronavirus ultime notizie,39400,coronavirus,2020-03-28
|
||||
coronavirus worldometer,37050,coronavirus,2020-03-28
|
||||
worldometer coronavirus,36750,coronavirus,2020-03-28
|
||||
us coronavirus cases,35250,coronavirus,2020-03-28
|
||||
ultima hora coronavirus,30900,coronavirus,2020-03-28
|
||||
coronavirus update india,26000,coronavirus,2020-03-28
|
||||
coronavirus live map,24500,coronavirus,2020-03-28
|
||||
coronavirus lombardia,23350,coronavirus,2020-03-28
|
||||
coronavirus live update,22300,coronavirus,2020-03-28
|
||||
coronavirus romania,21100,coronavirus,2020-03-28
|
||||
coronavirus update uk,21050,coronavirus,2020-03-28
|
||||
decreto coronavirus,20650,coronavirus,2020-03-28
|
||||
coronavirus österreich,19650,coronavirus,2020-03-28
|
||||
coronavirus colorado,17900,coronavirus,2020-03-28
|
||||
coronavirus memes,17700,coronavirus,2020-03-28
|
||||
coronavirus milano,17050,coronavirus,2020-03-28
|
||||
virginia coronavirus,16950,coronavirus,2020-03-28
|
||||
coronavirus tracker,16850,coronavirus,2020-03-28
|
||||
coronavirus cases in india,16600,coronavirus,2020-03-28
|
||||
coronavirus conseils,16300,coronavirus,2020-03-28
|
||||
coronavirus,1175050,covid-19,2020-03-28
|
||||
coronavirus covid-19,1169000,covid-19,2020-03-28
|
||||
covid,644400,covid-19,2020-03-28
|
||||
covid-19 cases,603750,covid-19,2020-03-28
|
||||
covid 19,452650,covid-19,2020-03-28
|
||||
corona,395750,covid-19,2020-03-28
|
||||
covid-19 virus,386500,covid-19,2020-03-28
|
||||
covid-19 symptoms,290450,covid-19,2020-03-28
|
||||
covid-19 map,283950,covid-19,2020-03-28
|
||||
covid-19 news,238050,covid-19,2020-03-28
|
||||
covid-19 updates,225500,covid-19,2020-03-28
|
||||
covid-19 update,212000,covid-19,2020-03-28
|
||||
covid-19 live,203750,covid-19,2020-03-28
|
||||
corona virus,198650,covid-19,2020-03-28
|
||||
covid-19 us,186600,covid-19,2020-03-28
|
||||
covid-19 who,173150,covid-19,2020-03-28
|
||||
what is covid-19,170900,covid-19,2020-03-28
|
||||
canada covid-19,168900,covid-19,2020-03-28
|
||||
who,168150,covid-19,2020-03-28
|
||||
covid-19 canada,165250,covid-19,2020-03-28
|
||||
cdc covid-19,145600,covid-19,2020-03-28
|
||||
cdc,140700,covid-19,2020-03-28
|
||||
covid-19 test,139050,covid-19,2020-03-28
|
||||
covid-19 world,132150,covid-19,2020-03-28
|
||||
china covid-19,129200,covid-19,2020-03-28
|
||||
covid,1482050,covid19,2020-03-28
|
||||
covid 19,1260000,covid19,2020-03-28
|
||||
coronavirus covid19,917000,covid19,2020-03-28
|
||||
coronavirus,905050,covid19,2020-03-28
|
||||
covid19 cases,514800,covid19,2020-03-28
|
||||
corona,412650,covid19,2020-03-28
|
||||
covid19 virus,384000,covid19,2020-03-28
|
||||
covid19 news,274300,covid19,2020-03-28
|
||||
symptoms covid19,231450,covid19,2020-03-28
|
||||
covid19 update,229800,covid19,2020-03-28
|
||||
covid19 map,225450,covid19,2020-03-28
|
||||
corona virus,205550,covid19,2020-03-28
|
||||
covid19 us,183000,covid19,2020-03-28
|
||||
what is covid19,168150,covid19,2020-03-28
|
||||
covid19 who,164550,covid19,2020-03-28
|
||||
who,161750,covid19,2020-03-28
|
||||
covid19 canada,153250,covid19,2020-03-28
|
||||
italy covid19,150050,covid19,2020-03-28
|
||||
covid19 test,145450,covid19,2020-03-28
|
||||
italy,145300,covid19,2020-03-28
|
||||
china covid19,143450,covid19,2020-03-28
|
||||
uk covid19,133900,covid19,2020-03-28
|
||||
covid-19,126550,covid19,2020-03-28
|
||||
usa covid19,121450,covid19,2020-03-28
|
||||
covid19 deaths,116200,covid19,2020-03-28
|
||||
coronavirus sars-cov-2,705200,sars-cov-2,2020-03-28
|
||||
coronavirus,691350,sars-cov-2,2020-03-28
|
||||
sars-cov-2 covid-19,549050,sars-cov-2,2020-03-28
|
||||
covid-19,482800,sars-cov-2,2020-03-28
|
||||
sars,326400,sars-cov-2,2020-03-28
|
||||
virus sars-cov-2,305300,sars-cov-2,2020-03-28
|
||||
corona,261700,sars-cov-2,2020-03-28
|
||||
covid 19,250950,sars-cov-2,2020-03-28
|
||||
sars-cov-2 covid 19,245900,sars-cov-2,2020-03-28
|
||||
who,118700,sars-cov-2,2020-03-28
|
||||
corona virus,105000,sars-cov-2,2020-03-28
|
||||
covid19,92650,sars-cov-2,2020-03-28
|
||||
sars-cov-2 vs covid-19,92200,sars-cov-2,2020-03-28
|
||||
what is sars-cov-2,90550,sars-cov-2,2020-03-28
|
||||
sars cov 2,75800,sars-cov-2,2020-03-28
|
||||
sars-cov-2 wiki,62400,sars-cov-2,2020-03-28
|
||||
koronawirus,50900,sars-cov-2,2020-03-28
|
||||
sars-cov,50500,sars-cov-2,2020-03-28
|
||||
mers,50300,sars-cov-2,2020-03-28
|
||||
sars-cov-2 symptoms,47500,sars-cov-2,2020-03-28
|
||||
pubmed,25200,sars-cov-2,2020-03-28
|
||||
sars-cov-2 vs cod-19,25150,sars-cov-2,2020-03-28
|
||||
on the origin and continuing evolution of sars-cov-2,22300,sars-cov-2,2020-03-28
|
||||
the proximal origin of sars-cov-2,22250,sars-cov-2,2020-03-28
|
||||
sars2,19050,sars-cov-2,2020-03-28
|
||||
coronavirus pandemic,665900,covid-19 pandemic,2020-03-28
|
||||
coronavirus,648150,covid-19 pandemic,2020-03-28
|
||||
covid-19 coronavirus pandemic,621900,covid-19 pandemic,2020-03-28
|
||||
is covid-19 a pandemic,339100,covid-19 pandemic,2020-03-28
|
||||
who pandemic,284050,covid-19 pandemic,2020-03-28
|
||||
who,269600,covid-19 pandemic,2020-03-28
|
||||
covid 19,221350,covid-19 pandemic,2020-03-28
|
||||
covid 19 pandemic,221150,covid-19 pandemic,2020-03-28
|
||||
epidemic,168100,covid-19 pandemic,2020-03-28
|
||||
pandemic meaning,142000,covid-19 pandemic,2020-03-28
|
||||
cdc,115650,covid-19 pandemic,2020-03-28
|
||||
what is a pandemic,102200,covid-19 pandemic,2020-03-28
|
||||
pandemic definition,82550,covid-19 pandemic,2020-03-28
|
||||
covid-19 symptoms,75350,covid-19 pandemic,2020-03-28
|
||||
coronavirus pandemic covid-19 live world map/count,52600,covid-19 pandemic,2020-03-28
|
||||
covid-19 updates,49400,covid-19 pandemic,2020-03-28
|
||||
covid-19 pandemic unemployment payment,39300,covid-19 pandemic,2020-03-28
|
||||
covid-19 pandemic unemployment,36150,covid-19 pandemic,2020-03-28
|
||||
pandemic vs endemic,29600,covid-19 pandemic,2020-03-28
|
||||
who declared covid-19 pandemic,29550,covid-19 pandemic,2020-03-28
|
||||
when was the last pandemic,19550,covid-19 pandemic,2020-03-28
|
|
240
transliterations/data/output/related_searches_top.csv
Normal file
240
transliterations/data/output/related_searches_top.csv
Normal file
@ -0,0 +1,240 @@
|
||||
query,value,term,date
|
||||
coronavirus update,100,coronavirus,2020-03-27
|
||||
corona,94,coronavirus,2020-03-27
|
||||
coronavirus symptoms,82,coronavirus,2020-03-27
|
||||
news coronavirus,74,coronavirus,2020-03-27
|
||||
coronavirus cases,69,coronavirus,2020-03-27
|
||||
uk coronavirus,53,coronavirus,2020-03-27
|
||||
corona virus,49,coronavirus,2020-03-27
|
||||
el coronavirus,43,coronavirus,2020-03-27
|
||||
coronavirus china,41,coronavirus,2020-03-27
|
||||
coronavirus italia,41,coronavirus,2020-03-27
|
||||
coronavirus map,40,coronavirus,2020-03-27
|
||||
india coronavirus,38,coronavirus,2020-03-27
|
||||
coronavirus france,36,coronavirus,2020-03-27
|
||||
coronavirus sintomas,31,coronavirus,2020-03-27
|
||||
coronavirus italy,29,coronavirus,2020-03-27
|
||||
italy,29,coronavirus,2020-03-27
|
||||
us coronavirus,28,coronavirus,2020-03-27
|
||||
usa coronavirus,28,coronavirus,2020-03-27
|
||||
coronavirus españa,27,coronavirus,2020-03-27
|
||||
symptoms of coronavirus,26,coronavirus,2020-03-27
|
||||
coronavirus live,25,coronavirus,2020-03-27
|
||||
coronavirus tips,21,coronavirus,2020-03-27
|
||||
what is coronavirus,21,coronavirus,2020-03-27
|
||||
coronavirus in india,20,coronavirus,2020-03-27
|
||||
coronavirus latest,19,coronavirus,2020-03-27
|
||||
covid-19 coronavirus,100,covid-19,2020-03-27
|
||||
coronavirus,97,covid-19,2020-03-27
|
||||
covid,56,covid-19,2020-03-27
|
||||
covid-19 cases,50,covid-19,2020-03-27
|
||||
covid 19,39,covid-19,2020-03-27
|
||||
corona,33,covid-19,2020-03-27
|
||||
virus covid-19,33,covid-19,2020-03-27
|
||||
covid-19 map,22,covid-19,2020-03-27
|
||||
covid-19 symptoms,22,covid-19,2020-03-27
|
||||
covid-19 news,20,covid-19,2020-03-27
|
||||
covid-19 updates,20,covid-19,2020-03-27
|
||||
covid-19 update,18,covid-19,2020-03-27
|
||||
covid-19 live,17,covid-19,2020-03-27
|
||||
corona virus,17,covid-19,2020-03-27
|
||||
covid-19 us,15,covid-19,2020-03-27
|
||||
who,14,covid-19,2020-03-27
|
||||
covid-19 who,14,covid-19,2020-03-27
|
||||
covid-19 canada,14,covid-19,2020-03-27
|
||||
what is covid-19,14,covid-19,2020-03-27
|
||||
covid-19 cdc,12,covid-19,2020-03-27
|
||||
cdc,11,covid-19,2020-03-27
|
||||
covid-19 test,11,covid-19,2020-03-27
|
||||
covid-19 italy,11,covid-19,2020-03-27
|
||||
covid-19 china,11,covid-19,2020-03-27
|
||||
covid-19 usa,10,covid-19,2020-03-27
|
||||
covid,100,covid19,2020-03-27
|
||||
covid 19,89,covid19,2020-03-27
|
||||
coronavirus covid19,66,covid19,2020-03-27
|
||||
coronavirus,65,covid19,2020-03-27
|
||||
covid19 cases,35,covid19,2020-03-27
|
||||
corona,28,covid19,2020-03-27
|
||||
corona covid19,27,covid19,2020-03-27
|
||||
covid19 virus,26,covid19,2020-03-27
|
||||
covid19 news,18,covid19,2020-03-27
|
||||
covid19 update,16,covid19,2020-03-27
|
||||
symptoms covid19,15,covid19,2020-03-27
|
||||
covid19 symptoms,15,covid19,2020-03-27
|
||||
covid19 map,15,covid19,2020-03-27
|
||||
corona virus,14,covid19,2020-03-27
|
||||
covid19 us,12,covid19,2020-03-27
|
||||
what is covid19,12,covid19,2020-03-27
|
||||
covid19 who,11,covid19,2020-03-27
|
||||
who,11,covid19,2020-03-27
|
||||
canada covid19,11,covid19,2020-03-27
|
||||
italy covid19,11,covid19,2020-03-27
|
||||
italy,10,covid19,2020-03-27
|
||||
test covid19,10,covid19,2020-03-27
|
||||
china covid19,10,covid19,2020-03-27
|
||||
covid19 uk,9,covid19,2020-03-27
|
||||
covid-19,9,covid19,2020-03-27
|
||||
coronavirus,100,sars-cov-2,2020-03-27
|
||||
coronavirus sars-cov-2,97,sars-cov-2,2020-03-27
|
||||
covid-19,80,sars-cov-2,2020-03-27
|
||||
covid-19 sars-cov-2,76,sars-cov-2,2020-03-27
|
||||
sars,49,sars-cov-2,2020-03-27
|
||||
virus sars-cov-2,43,sars-cov-2,2020-03-27
|
||||
corona,34,sars-cov-2,2020-03-27
|
||||
sars-cov-2 covid 19,32,sars-cov-2,2020-03-27
|
||||
covid 19,30,sars-cov-2,2020-03-27
|
||||
corona virus,17,sars-cov-2,2020-03-27
|
||||
sars-cov-2 vs covid-19,15,sars-cov-2,2020-03-27
|
||||
what is sars-cov-2,15,sars-cov-2,2020-03-27
|
||||
sars cov 2,13,sars-cov-2,2020-03-27
|
||||
sars-cov-2 wiki,8,sars-cov-2,2020-03-27
|
||||
koronawirus,7,sars-cov-2,2020-03-27
|
||||
cdc,6,sars-cov-2,2020-03-27
|
||||
sars-cov,6,sars-cov-2,2020-03-27
|
||||
sars-cov-1,6,sars-cov-2,2020-03-27
|
||||
sars-cov-2 vs cod-19,4,sars-cov-2,2020-03-27
|
||||
sars-cov-2 genome,3,sars-cov-2,2020-03-27
|
||||
pubmed,3,sars-cov-2,2020-03-27
|
||||
коронавирус,3,sars-cov-2,2020-03-27
|
||||
sars-cov-2 or covid-19,3,sars-cov-2,2020-03-27
|
||||
on the origin and continuing evolution of sars-cov-2,3,sars-cov-2,2020-03-27
|
||||
the proximal origin of sars-cov-2,3,sars-cov-2,2020-03-27
|
||||
coronavirus,100,covid-19 pandemic,2020-03-27
|
||||
covid-19 coronavirus pandemic,97,covid-19 pandemic,2020-03-27
|
||||
coronavirus pandemic,93,covid-19 pandemic,2020-03-27
|
||||
who,41,covid-19 pandemic,2020-03-27
|
||||
is covid-19 a pandemic,38,covid-19 pandemic,2020-03-27
|
||||
who pandemic,37,covid-19 pandemic,2020-03-27
|
||||
covid 19 pandemic,35,covid-19 pandemic,2020-03-27
|
||||
pandemic meaning,20,covid-19 pandemic,2020-03-27
|
||||
what is a pandemic,15,covid-19 pandemic,2020-03-27
|
||||
pandemic vs epidemic,14,covid-19 pandemic,2020-03-27
|
||||
cdc pandemic,14,covid-19 pandemic,2020-03-27
|
||||
pandemic definition,13,covid-19 pandemic,2020-03-27
|
||||
pandemic define,9,covid-19 pandemic,2020-03-27
|
||||
covid-19 pandemic plan,8,covid-19 pandemic,2020-03-27
|
||||
coronavirus pandemic covid-19 live world map/count,8,covid-19 pandemic,2020-03-27
|
||||
who declared covid-19 pandemic,7,covid-19 pandemic,2020-03-27
|
||||
covid-19 pandemic unemployment payment,7,covid-19 pandemic,2020-03-27
|
||||
coronavirus update,5,covid-19 pandemic,2020-03-27
|
||||
coronavirus update,100,coronavirus,2020-03-28
|
||||
corona,97,coronavirus,2020-03-28
|
||||
coronavirus symptoms,84,coronavirus,2020-03-28
|
||||
news coronavirus,73,coronavirus,2020-03-28
|
||||
coronavirus cases,70,coronavirus,2020-03-28
|
||||
coronavirus uk,53,coronavirus,2020-03-28
|
||||
corona virus,50,coronavirus,2020-03-28
|
||||
coronavirus map,43,coronavirus,2020-03-28
|
||||
china coronavirus,43,coronavirus,2020-03-28
|
||||
el coronavirus,43,coronavirus,2020-03-28
|
||||
coronavirus italia,40,coronavirus,2020-03-28
|
||||
india coronavirus,39,coronavirus,2020-03-28
|
||||
coronavirus france,38,coronavirus,2020-03-28
|
||||
coronavirus italy,31,coronavirus,2020-03-28
|
||||
sintomas coronavirus,31,coronavirus,2020-03-28
|
||||
coronavirus sintomas,31,coronavirus,2020-03-28
|
||||
coronavirus usa,30,coronavirus,2020-03-28
|
||||
us coronavirus,29,coronavirus,2020-03-28
|
||||
coronavirus españa,27,coronavirus,2020-03-28
|
||||
symptoms of coronavirus,26,coronavirus,2020-03-28
|
||||
coronavirus live,25,coronavirus,2020-03-28
|
||||
coronavirus tips,23,coronavirus,2020-03-28
|
||||
coronavirus in india,21,coronavirus,2020-03-28
|
||||
casos coronavirus,21,coronavirus,2020-03-28
|
||||
covid,21,coronavirus,2020-03-28
|
||||
coronavirus,100,covid-19,2020-03-28
|
||||
coronavirus covid-19,99,covid-19,2020-03-28
|
||||
covid,55,covid-19,2020-03-28
|
||||
covid-19 cases,51,covid-19,2020-03-28
|
||||
covid 19,39,covid-19,2020-03-28
|
||||
corona,34,covid-19,2020-03-28
|
||||
covid-19 virus,33,covid-19,2020-03-28
|
||||
covid-19 symptoms,25,covid-19,2020-03-28
|
||||
covid-19 map,24,covid-19,2020-03-28
|
||||
covid-19 news,20,covid-19,2020-03-28
|
||||
covid-19 updates,19,covid-19,2020-03-28
|
||||
covid-19 update,18,covid-19,2020-03-28
|
||||
covid-19 live,17,covid-19,2020-03-28
|
||||
corona virus,17,covid-19,2020-03-28
|
||||
covid-19 us,16,covid-19,2020-03-28
|
||||
covid-19 who,15,covid-19,2020-03-28
|
||||
what is covid-19,15,covid-19,2020-03-28
|
||||
canada covid-19,14,covid-19,2020-03-28
|
||||
who,14,covid-19,2020-03-28
|
||||
covid-19 canada,14,covid-19,2020-03-28
|
||||
cdc covid-19,12,covid-19,2020-03-28
|
||||
cdc,12,covid-19,2020-03-28
|
||||
covid-19 test,12,covid-19,2020-03-28
|
||||
covid-19 world,11,covid-19,2020-03-28
|
||||
china covid-19,11,covid-19,2020-03-28
|
||||
covid,100,covid19,2020-03-28
|
||||
covid 19,85,covid19,2020-03-28
|
||||
coronavirus covid19,62,covid19,2020-03-28
|
||||
coronavirus,61,covid19,2020-03-28
|
||||
covid19 cases,35,covid19,2020-03-28
|
||||
corona,28,covid19,2020-03-28
|
||||
covid19 virus,26,covid19,2020-03-28
|
||||
covid19 news,19,covid19,2020-03-28
|
||||
symptoms covid19,16,covid19,2020-03-28
|
||||
covid19 update,16,covid19,2020-03-28
|
||||
covid19 map,15,covid19,2020-03-28
|
||||
corona virus,14,covid19,2020-03-28
|
||||
covid19 us,12,covid19,2020-03-28
|
||||
what is covid19,11,covid19,2020-03-28
|
||||
covid19 who,11,covid19,2020-03-28
|
||||
who,11,covid19,2020-03-28
|
||||
covid19 canada,10,covid19,2020-03-28
|
||||
italy covid19,10,covid19,2020-03-28
|
||||
covid19 test,10,covid19,2020-03-28
|
||||
italy,10,covid19,2020-03-28
|
||||
china covid19,10,covid19,2020-03-28
|
||||
uk covid19,9,covid19,2020-03-28
|
||||
covid-19,9,covid19,2020-03-28
|
||||
usa covid19,8,covid19,2020-03-28
|
||||
covid19 deaths,8,covid19,2020-03-28
|
||||
coronavirus sars-cov-2,100,sars-cov-2,2020-03-28
|
||||
coronavirus,98,sars-cov-2,2020-03-28
|
||||
sars-cov-2 covid-19,78,sars-cov-2,2020-03-28
|
||||
covid-19,68,sars-cov-2,2020-03-28
|
||||
sars,46,sars-cov-2,2020-03-28
|
||||
virus sars-cov-2,43,sars-cov-2,2020-03-28
|
||||
corona,37,sars-cov-2,2020-03-28
|
||||
covid 19,36,sars-cov-2,2020-03-28
|
||||
sars-cov-2 covid 19,35,sars-cov-2,2020-03-28
|
||||
who,17,sars-cov-2,2020-03-28
|
||||
corona virus,15,sars-cov-2,2020-03-28
|
||||
covid19,13,sars-cov-2,2020-03-28
|
||||
sars-cov-2 vs covid-19,13,sars-cov-2,2020-03-28
|
||||
what is sars-cov-2,13,sars-cov-2,2020-03-28
|
||||
sars cov 2,11,sars-cov-2,2020-03-28
|
||||
sars-cov-2 wiki,9,sars-cov-2,2020-03-28
|
||||
koronawirus,7,sars-cov-2,2020-03-28
|
||||
sars-cov,7,sars-cov-2,2020-03-28
|
||||
mers,7,sars-cov-2,2020-03-28
|
||||
sars-cov-2 symptoms,7,sars-cov-2,2020-03-28
|
||||
pubmed,4,sars-cov-2,2020-03-28
|
||||
sars-cov-2 vs cod-19,4,sars-cov-2,2020-03-28
|
||||
on the origin and continuing evolution of sars-cov-2,3,sars-cov-2,2020-03-28
|
||||
the proximal origin of sars-cov-2,3,sars-cov-2,2020-03-28
|
||||
sars2,3,sars-cov-2,2020-03-28
|
||||
coronavirus pandemic,100,covid-19 pandemic,2020-03-28
|
||||
coronavirus,97,covid-19 pandemic,2020-03-28
|
||||
covid-19 coronavirus pandemic,93,covid-19 pandemic,2020-03-28
|
||||
is covid-19 a pandemic,51,covid-19 pandemic,2020-03-28
|
||||
who pandemic,43,covid-19 pandemic,2020-03-28
|
||||
who,40,covid-19 pandemic,2020-03-28
|
||||
covid 19,33,covid-19 pandemic,2020-03-28
|
||||
covid 19 pandemic,33,covid-19 pandemic,2020-03-28
|
||||
epidemic,25,covid-19 pandemic,2020-03-28
|
||||
pandemic meaning,21,covid-19 pandemic,2020-03-28
|
||||
cdc,17,covid-19 pandemic,2020-03-28
|
||||
what is a pandemic,15,covid-19 pandemic,2020-03-28
|
||||
pandemic definition,12,covid-19 pandemic,2020-03-28
|
||||
covid-19 symptoms,11,covid-19 pandemic,2020-03-28
|
||||
coronavirus pandemic covid-19 live world map/count,8,covid-19 pandemic,2020-03-28
|
||||
covid-19 updates,7,covid-19 pandemic,2020-03-28
|
||||
covid-19 pandemic unemployment payment,6,covid-19 pandemic,2020-03-28
|
||||
covid-19 pandemic unemployment,5,covid-19 pandemic,2020-03-28
|
||||
pandemic vs endemic,4,covid-19 pandemic,2020-03-28
|
||||
who declared covid-19 pandemic,4,covid-19 pandemic,2020-03-28
|
||||
when was the last pandemic,3,covid-19 pandemic,2020-03-28
|
|
2528
transliterations/data/output/wikidata_search_results.csv
Normal file
2528
transliterations/data/output/wikidata_search_results.csv
Normal file
File diff suppressed because it is too large
Load Diff
79671
transliterations/data/output/wikidata_search_results_from_gtrends.csv
Normal file
79671
transliterations/data/output/wikidata_search_results_from_gtrends.csv
Normal file
File diff suppressed because it is too large
Load Diff
2
transliterations/requirements.txt
Normal file
2
transliterations/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
pytrends>=4.7.2
|
||||
mwapi>=0.5.1
|
2
transliterations/src/__init__.py
Normal file
2
transliterations/src/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from wikidata_api_calls import *
|
||||
from find_entities import *
|
76
transliterations/src/collect_trends.py
Normal file
76
transliterations/src/collect_trends.py
Normal file
@ -0,0 +1,76 @@
|
||||
# this follows a similar approach to nick's trends.js but in python
|
||||
from pytrends.request import TrendReq
|
||||
from datetime import datetime
|
||||
from os import path
|
||||
import csv
|
||||
from itertools import islice, chain, zip_longest
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# from itertools recipes
|
||||
#https://docs.python.org/3.6/library/itertools.html#itertools-recipes
|
||||
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    Adapted from the itertools recipes. When the input does not divide
    evenly, the final tuple is padded with ``fillvalue``; for example
    ``grouper('ABCDEFG', 3, 'x')`` produces ABC, DEF, Gxx.
    """
    # Every slot of an output tuple draws from the *same* iterator, so
    # zip_longest pulls n successive items to build each tuple.
    shared = iter(iterable)
    slots = tuple(shared for _ in range(n))
    return zip_longest(*slots, fillvalue=fillvalue)
|
||||
|
||||
def get_daily_trends():
    """Append today's trending Google searches to daily_google_trends.csv.

    Fetches the trending searches through pytrends' ``today_searches()`` and
    writes one row per term as ``(date, term, top)``, where ``top`` is the
    0-based position of the term in the returned sequence. The header row is
    written only when the output file does not exist yet.
    """
    trendReq = TrendReq(backoff_factor=0.2)
    today_trending = trendReq.today_searches()
    daily_trends_outfile = path.join("..", "data", "output", "daily_google_trends.csv")
    header = ['date', 'term', 'top']

    # Must be decided before opening: append mode creates the file, after
    # which the existence check could no longer tell a fresh file apart.
    write_header = not path.exists(daily_trends_outfile)

    # Stamp every row of this run with one date, even if the run crosses midnight.
    today = str(datetime.now().date())

    # Open the same path that was checked above instead of a second,
    # hard-coded copy of it, so the two can never drift apart.
    with open(daily_trends_outfile, 'a', newline='') as of:
        writer = csv.writer(of)
        if write_header:
            writer.writerow(header)

        for i, trend in enumerate(today_trending):
            writer.writerow([today, trend, i])
|
||||
|
||||
def get_related_queries(stems):
    """Fetch Google Trends related queries for ``stems`` and append them to CSV.

    The stems are sent to pytrends in batches of five. Each non-empty result
    table is tagged with the term it belongs to, tables are pooled by result
    key, stamped with today's date, and written to
    ``../data/output/related_searches_<key>.csv`` (appending without a header
    when the file already exists, otherwise creating it with a header).
    """
    # we have to batch these in sets of 5
    trendReq = TrendReq(backoff_factor=0.2)

    def _get_related_queries(chunk):
        # grouper pads the final chunk with None; strip the padding.
        kw_list = [kw for kw in chunk if kw is not None]
        trendReq.build_payload(kw_list=kw_list)
        for term, results in trendReq.related_queries().items():
            for key, df in results.items():
                if df is None:
                    continue
                df["term"] = term
                yield (key, df)

    # Pool the per-term tables under their result key.
    out = {}
    for chunk in grouper(stems, 5):
        for key, value in _get_related_queries(chunk):
            out.setdefault(key, []).append(value)

    for k, frames in out.items():
        df = pd.concat(frames)
        df['date'] = str(datetime.now().date())
        out[k] = df

        outfile = path.join('..', 'data', 'output', f"related_searches_{k}.csv")
        # Append to an existing file; only a newly created file gets a header.
        appending = path.exists(outfile)
        df.to_csv(outfile,
                  mode='a' if appending else 'w',
                  header=not appending,
                  index=False)
|
||||
|
||||
# Load the base search terms, one per line. The context manager closes the
# file as soon as it has been read, and blank lines are skipped so an empty
# keyword is never passed on to the trend queries.
with open("../data/input/base_terms.txt", 'r') as term_file:
    stems = [t.strip() for t in term_file if t.strip()]

get_daily_trends()

get_related_queries(stems)
|
16
transliterations/src/compile_transliterated_phrases.sh
Executable file
16
transliterations/src/compile_transliterated_phrases.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash

# For now these scripts don't accept command line arguments. It's an MVP

# Every path below is relative to the directory holding this script, so move
# there first; otherwise the pipeline only works when launched from src/.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

echo "Reading Google trends"
python3 collect_trends.py

echo "Searching for Wikidata entities using base_terms.txt"
python3 wikidata_search.py ../data/input/base_terms.txt --output ../data/output/wikidata_search_results.csv

echo "Searching for Wikidata entities using Google trends"
python3 wikidata_search.py ../data/output/related_searches_rising.csv ../data/output/related_searches_top.csv --use-gtrends --output ../data/output/wikidata_search_results_from_gtrends.csv

echo "Finding transliterations from Wikidata using sparql"
# One --topN value per input file, in the same order as the inputs.
python3 wikidata_transliterations.py ../data/output/wikidata_search_results_from_gtrends.csv ../data/output/wikidata_search_results.csv --topN 10 20 --output "../data/output/$(date '+%Y-%m-%d')_wikidata_entity_labels.csv"
|
||||
|
1
transliterations/src/defaults.py
Normal file
1
transliterations/src/defaults.py
Normal file
@ -0,0 +1 @@
|
||||
# User-Agent string identifying this project; wikidata_api_calls.py imports it
# and passes it to the mwapi session it creates.
user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"
|
35
transliterations/src/wikidata_api_calls.py
Normal file
35
transliterations/src/wikidata_api_calls.py
Normal file
@ -0,0 +1,35 @@
|
||||
# File defines functions for making api calls to find translations and transliterations for key terms.
|
||||
import mwapi
|
||||
import requests
|
||||
import sys
|
||||
import time
|
||||
from defaults import user_agent
|
||||
|
||||
def get_wikidata_api():
    """Return a new mwapi session for Wikidata that identifies itself with the project user agent."""
    return mwapi.Session(host="https://wikidata.org/w/api.php", user_agent=user_agent)
|
||||
|
||||
def search_wikidata(session, term, *args, **kwargs):
    """Run a full-text search for ``term`` in namespace 0 and return the hits.

    :param session: object exposing ``get(...)`` like ``mwapi.Session``.
    :param term: the search string, sent as ``srsearch``.
    :param args: extra positional arguments forwarded to ``session.get``.
    :param kwargs: extra request parameters forwarded to ``session.get``.
    :return: the list found under ``query.search`` in the response (it may be
        empty when the search matched nothing).
    :raises mwapi.session.APIError: when the response has no ``query.search``
        section at all.
    """
    search_results = session.get(action='query',
                                 list='search',
                                 srsearch=term,
                                 # srqiprofile='popular_inclinks_pv',
                                 srlimit='max',
                                 srnamespace=0,
                                 *args,
                                 **kwargs)

    # Either level can be absent from the response; treat a missing 'query'
    # like a missing 'search' instead of failing on None.get(...).
    query = search_results.get('query', None) or {}
    results = query.get('search', None)

    if results is None:
        # APIError takes (code, info, content); the raw response is attached
        # so callers can inspect what came back.
        raise mwapi.session.APIError("noresults",
                                     f"No results for query: {term}",
                                     search_results)

    return results
|
||||
|
||||
def run_sparql_query(q):
    """Send SPARQL query ``q`` to the Wikidata query service and return the response.

    :param q: the SPARQL query text.
    :return: the ``requests`` response object; JSON output is requested, and
        decoding it is left to the caller.
    """
    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql",
                           params={"format":"json","query":q},
                           # Identify the project to the service, as the mwapi
                           # session in this file already does.
                           headers={"User-Agent": user_agent})
    # Fixed pause after every request (presumably crude rate limiting).
    time.sleep(2)
    return results
|
||||
|
95
transliterations/src/wikidata_search.py
Normal file
95
transliterations/src/wikidata_search.py
Normal file
@ -0,0 +1,95 @@
|
||||
# generate a list of wikidata entities related to keywords
|
||||
from os import path
|
||||
from sys import stdout
|
||||
from wikidata_api_calls import search_wikidata, get_wikidata_api
|
||||
import csv
|
||||
from itertools import chain
|
||||
|
||||
class Wikidata_ResultSet:
    """Accumulates Wikidata search hits for many terms and writes them out as CSV."""

    def __init__(self):
        # One list of Wikidata_Result objects per call to extend().
        self.results = []

    def extend(self, term, results):
        """Record the hits returned for ``term``.

        :param term: the search term the hits belong to.
        :param results: iterable of raw search hits; each hit's position in the
            iterable is stored as its search position.
        """
        # Build the rows immediately: a stored generator could be consumed only
        # once, so a second to_csv() call would silently write nothing.
        self.results.append(
            [Wikidata_Result(term, result, i)
             for i, result in enumerate(results)]
        )

    def to_csv(self, outfile=None, mode='w'):
        """Write every recorded hit as CSV.

        :param outfile: path of the file to write; ``None`` writes to stdout.
        :param mode: ``'w'`` overwrites ``outfile``; any other value appends
            when ``outfile`` already exists.
        """
        if outfile is None:
            self._write_rows(stdout, write_header=True)
            return

        appending = path.exists(outfile) and mode != 'w'
        # Rows appended to a file that already has content must not repeat the
        # header line in the middle of the data.
        write_header = not appending or path.getsize(outfile) == 0
        with open(outfile, 'a' if appending else 'w', newline='') as of:
            self._write_rows(of, write_header)

    def _write_rows(self, of, write_header):
        """Write the optional header and all result rows to the open file ``of``."""
        writer = csv.writer(of)
        if write_header:
            writer.writerow(Wikidata_Result.__slots__)
        writer.writerows(map(Wikidata_Result.to_list, chain(* self.results)))
|
||||
|
||||
|
||||
class Wikidata_Result:
    """A single Wikidata search hit: the term searched, the entity found, and its rank.

    ``__slots__`` doubles as the CSV column order used by ``to_list``.
    """
    __slots__=['search_term','entityid','pageid','search_position','timestamp']

    def __init__(self, term, search_result, position):
        """
        :param term: the search term; surrounding whitespace is removed.
        :param search_result: mapping with ``title``, ``pageid`` and ``timestamp`` keys.
        :param position: rank of this hit within the results for ``term``.
        """
        self.search_term = term.strip()
        # The hit's 'title' is stored as the entity id.
        self.entityid = search_result['title']
        self.pageid = int(search_result['pageid'])
        self.search_position = int(position)
        self.timestamp = search_result['timestamp']

    def to_list(self):
        """Return the field values in ``__slots__`` order, ready for ``csv.writer``."""
        return [getattr(self, field) for field in self.__slots__]
|
||||
|
||||
def run_wikidata_searches(terms):
    """Search Wikidata for each term in ``terms`` and gather the hits.

    :param terms: iterable of search strings.
    :return: a ``Wikidata_ResultSet`` holding the hits of every search, in
        the order the terms were given.
    """
    session = get_wikidata_api()
    collected = Wikidata_ResultSet()
    for term in terms:
        collected.extend(term, search_wikidata(session, term))
    return collected
|
||||
|
||||
def read_google_trends_files(terms_files):
    """Yield the ``query`` column of each CSV file in ``terms_files``, in order.

    :param terms_files: iterable of paths to CSV files that have a header row
        containing a ``query`` column.
    :return: generator of query strings, file by file.
    :raises KeyError: if a file has rows but no ``query`` column.
    """
    for terms_file in terms_files:
        # Each file is closed as soon as its rows have been read, instead of
        # leaving one open handle per input behind.
        with open(terms_file,'r',newline='') as infile:
            for row in csv.DictReader(infile):
                yield row['query']
|
||||
|
||||
|
||||
def trawl_google_trends(terms_files, outfile = None, mode='w'):
    """Search Wikidata for every query listed in Google Trends output files.

    :param terms_files: paths of CSV files with a ``query`` column.
    :param outfile: path for the CSV of search hits; ``None`` is passed through to ``to_csv``.
    :param mode: write mode handed to ``Wikidata_ResultSet.to_csv``.
    """
    queries = list(read_google_trends_files(terms_files))
    run_wikidata_searches(queries).to_csv(outfile, mode)
|
||||
|
||||
def trawl_base_terms(infiles, outfile = None, mode='w'):
    """Search Wikidata for every term listed in plain-text term files.

    :param infiles: paths of text files holding one search term per line.
    :param outfile: path for the CSV of search hits; ``None`` is passed through to ``to_csv``.
    :param mode: write mode handed to the result set's ``to_csv``.
    """
    terms = []
    for infile in infiles:
        # Close each input once read. Terms are stripped so the search string
        # carries no trailing newline, and blank lines are skipped so they do
        # not become empty searches.
        with open(infile,'r') as term_file:
            for line in term_file:
                term = line.strip()
                if term:
                    terms.append(term)

    resultset = run_wikidata_searches(terms)
    resultset.to_csv(outfile, mode)
|
||||
|
||||
## search each of the base terms in wikidata
|
||||
|
||||
# store unique entities found in the search results, the position in the search result, and the date
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: search Wikidata for the terms found in the input files.
    import argparse

    cli = argparse.ArgumentParser("Search wikidata for entities related to a set of terms.")
    cli.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
    cli.add_argument('--use-gtrends', action='store_true', help = 'toggle whether the input is the output from google trends')
    cli.add_argument('--output', type=str, help='an output file. defaults to stdout')
    # NOTE(review): --overwrite is accepted but nothing below reads it; both
    # trawl functions run with their default mode.
    cli.add_argument('--overwrite', action='store_true', help = 'overwrite existing output files instead of appending')
    args = cli.parse_args()

    # Google Trends CSVs and plain term lists are read by different functions.
    trawl = trawl_google_trends if args.use_gtrends else trawl_base_terms
    trawl(args.inputs, args.output)
|
96
transliterations/src/wikidata_transliterations.py
Normal file
96
transliterations/src/wikidata_transliterations.py
Normal file
@ -0,0 +1,96 @@
|
||||
from wikidata_api_calls import run_sparql_query
|
||||
from itertools import chain, islice
|
||||
import csv
|
||||
from json import JSONDecodeError
|
||||
|
||||
class LabelData:
    """One label (or alias) of a Wikidata entity, taken from a SPARQL result binding.

    ``__slots__`` is also used as the CSV header by the writer in this file.
    """
    __slots__ = ['entityid','label','langcode','is_alt']

    def __init__(self, wd_res, is_alt):
        """
        :param wd_res: one result binding, a mapping with ``label`` and ``entity`` entries.
        :param is_alt: flag stored as-is; callers in this file pass True for
            ``skos:altLabel`` results and False for ``rdfs:label`` results.
        """
        label_binding = wd_res.get('label')
        self.label = label_binding.get('value')
        self.langcode = label_binding.get('xml:lang')

        entity_binding = wd_res.get('entity')
        self.entityid = entity_binding.get('value')

        self.is_alt = is_alt

    def to_list(self):
        """Return the fields in ``__slots__`` order, ready for ``csv.writer``."""
        return [getattr(self, field) for field in self.__slots__]
|
||||
|
||||
def GetAllLabels(in_csvs, outfile, topNs):
    """Look up labels for the top-ranked entities of each search-result CSV and save them.

    :param in_csvs: paths of CSV files with ``entityid`` and ``search_position`` columns.
    :param outfile: path of the CSV file to (over)write with the labels.
    :param topNs: per-file cutoffs: only rows whose ``search_position`` is below
        the cutoff are used. May be ``None`` (5 for every file), a single int
        or a one-element list (applied to every file), or one value per file.
    :raises ValueError: if several cutoffs are given but their number does not
        match the number of input files.
    """
    default_topN = 5

    def load_entity_ids(in_csv, topN=default_topN):
        with open(in_csv,'r',newline='') as infile:
            for row in csv.DictReader(infile):
                if int(row['search_position']) < topN:
                    yield row["entityid"]

    in_csvs = list(in_csvs)

    # Normalise the cutoffs to exactly one per input. Pairing the two lists
    # as-is would silently ignore every input beyond the shorter list.
    if topNs is None:
        topNs = [default_topN]
    elif isinstance(topNs, int):
        topNs = [topNs]
    else:
        topNs = list(topNs)

    if len(topNs) == 1:
        topNs = topNs * len(in_csvs)
    elif len(topNs) != len(in_csvs):
        raise ValueError(
            f"got {len(topNs)} topN values for {len(in_csvs)} input files; "
            "pass one value, or one per file")

    ids = set()
    for in_csv, topN in zip(in_csvs, topNs):
        ids.update(load_entity_ids(in_csv, topN))

    # Sorted so the query batches and the output order are the same on every run.
    labeldata = GetEntityLabels(sorted(ids))

    with open(outfile, 'w', newline='') as of:
        writer = csv.writer(of)
        writer.writerow(LabelData.__slots__)
        writer.writerows(map(LabelData.to_list,labeldata))
|
||||
|
||||
|
||||
def GetEntityLabels(entityids):
    """Fetch the labels and alternative labels of Wikidata entities via SPARQL.

    The ids are queried in batches of 100, with one ``rdfs:label`` query and
    one ``skos:altLabel`` query per batch. The queries are issued lazily, as
    the returned iterator is consumed.

    :param entityids: iterable of entity ids without the ``wd:`` prefix.
    :return: iterator of ``LabelData`` objects; alternative labels have ``is_alt=True``.
    :raises RuntimeError: when a response decodes as JSON but has no
        ``results.bindings`` section. A response that is not JSON at all is
        printed together with its query and contributes no labels.
    """

    def run_query_and_parse(query, is_alt):
        results = run_sparql_query(query)
        try:
            jobj = results.json()

            res = jobj.get('results',None)
            if res is not None:
                res = res.get('bindings',None)
            if res is None:
                # Only names that exist in this module may be used here; the
                # query text is enough to identify the failing batch.
                raise RuntimeError(f"got invalid response from wikidata for {query}")

            for info in res:
                yield LabelData(info, is_alt)

        except JSONDecodeError as e:
            # Best effort: report the undecodable response and move on.
            print(e)
            print(query)

    def prep_query(query, prop, entityids):
        values = ' '.join(('wd:{0}'.format(entityid) for entityid in entityids))
        return query.format(prop, values)

    # Doubled braces are literal braces once str.format has run.
    base_query = """
    SELECT DISTINCT ?entity ?label WHERE {{
    ?entity {0} ?label;
    VALUES ?entity {{ {1} }}
    }}"""

    # we can't get all the entities at once. how about 100 at a time?
    chunksize = 100
    remaining = iter(entityids)
    calls = []
    while True:
        chunk = list(islice(remaining, chunksize))
        if not chunk:
            break
        label_query = prep_query(base_query, "rdfs:label", chunk)
        altLabel_query = prep_query(base_query, "skos:altLabel", chunk)
        calls.append(run_query_and_parse(label_query, is_alt=False))
        calls.append(run_query_and_parse(altLabel_query, is_alt=True))

    return chain(*calls)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: collect labels for the entities found by wikidata_search.py.
    import argparse
    parser = argparse.ArgumentParser("Use wikidata to find transliterations of terms")
    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read. the inputs are generated by wikidata_search.py')
    # The numeric default belongs to this cutoff, not to the output path.
    parser.add_argument('--topN', type=int, nargs='+', default=[20], help='limit number of wikidata search results to use, can pass one arg for each source.')
    # GetAllLabels always opens this path for writing, so it has to be given.
    parser.add_argument('--output', type=str, required=True, help='the output csv file')

    args = parser.parse_args()

    # Give every input file a cutoff: a single value applies to all of them.
    topNs = args.topN
    if len(topNs) == 1:
        topNs = topNs * len(args.inputs)
    elif len(topNs) != len(args.inputs):
        parser.error(f"--topN takes one value or one per input file "
                     f"(got {len(topNs)} values for {len(args.inputs)} files)")

    GetAllLabels(args.inputs, args.output, topNs=topNs)
|
@ -10,12 +10,6 @@
|
||||
###############################################################################
|
||||
|
||||
|
||||
#1 Load up the list of article names
|
||||
|
||||
#2 Repeatedly call the API with that list of names
|
||||
|
||||
#3 Save results as a TSV
|
||||
|
||||
import requests
|
||||
import argparse
|
||||
import json
|
||||
@ -26,7 +20,6 @@ import datetime
|
||||
#import feather
|
||||
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description='Call the views API repeatedly.')
|
||||
@ -55,31 +48,32 @@ def main():
|
||||
|
||||
|
||||
articleList = []
|
||||
with open(articleFile, 'r') as infileHandle:
|
||||
#theInfile = csv.reader(infileHandle, quotechar='"')
|
||||
theInfile = csv.reader(infileHandle)
|
||||
next(theInfile) #skip header
|
||||
for currentLine in theInfile:
|
||||
articleList.append(currentLine)
|
||||
#1 Load up the list of article names
|
||||
|
||||
j_Out = outputPath + "dailyviews" + queryDate + ".json"
|
||||
t_Out = outputPath + "dailyviews" + queryDate + ".tsv"
|
||||
j_Out = f"{outputPath}dailyviews{queryDate}.json"
|
||||
t_Out = f"{outputPath}dailyviews{queryDate}.tsv"
|
||||
|
||||
with open(articleFile, 'r') as infile:
|
||||
next(infile) #skip header
|
||||
articleList = list(infile)
|
||||
|
||||
j = []
|
||||
|
||||
i = 0 #iterator to deal with end of file
|
||||
#2 Repeatedly call the API with that list of names
|
||||
|
||||
for a in articleList:
|
||||
a = a[0] #destringify
|
||||
i = i+1
|
||||
url= "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/"
|
||||
url= url + a + "/daily/" + queryDate + "/" + queryDate #for now, single date at a time
|
||||
a = a.strip("\"\n") #destringify
|
||||
url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}"
|
||||
|
||||
response = requests.get(url)
|
||||
if response.ok:
|
||||
jd = json.loads(response.content)
|
||||
j.append(jd["items"][0])
|
||||
time.sleep(.1)
|
||||
else:
|
||||
print(f"Not ok response: {response.status_code} from {url}")
|
||||
|
||||
#3 Save results as a JSON and TSV
|
||||
|
||||
#all data in j now, make json file
|
||||
with open(j_Out, 'w') as j_outfile:
|
||||
@ -91,7 +85,7 @@ def main():
|
||||
dw.writerows(j)
|
||||
|
||||
|
||||
f_Out = outputPath + "dailyviews" + queryDate + ".feather"
|
||||
# f_Out = outputPath + "dailyviews" + queryDate + ".feather"
|
||||
# read the json back in and make a feather file?
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user