1
0

pivot for collecting dump data

This commit is contained in:
Matthew Gaughan 2024-12-17 15:52:29 -06:00
parent 4eb0b70608
commit db6b140748
6 changed files with 114 additions and 119 deletions

View File

@ -0,0 +1,18 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference: [MediaWiki history dumps — Python Pandas examples](https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples)\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0
src/lib/get_dumps.py Normal file
View File

View File

@ -1,117 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pandas'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/562128272.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas'"
]
}
],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'findspark'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/3416374666.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'HADOOP_CONF_DIR'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfunctions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mWindow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'findspark'"
]
}
],
"source": [
"import findspark\n",
"findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'wmfspark' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/1742447738.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## defining the spark session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mspark_config\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m spark = wmfspark.create_session(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mapp_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Pyspark notebook'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regular'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'wmfspark' is not defined"
]
}
],
"source": [
"## defining the spark session\n",
"spark_config = {}\n",
"spark = wmfspark.create_session(\n",
" app_name='Pyspark notebook', \n",
" type='regular'\n",
"# extra_settings=spark_config\n",
")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"success\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -8,6 +8,10 @@ import sys
import findspark
# Explicit findspark initialisation is currently disabled; the call is kept for reference.
#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")

# Configure the process environment before the pyspark imports below.
os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
# Have PySpark (PYSPARK_PYTHON) and its driver (PYSPARK_DRIVER_PYTHON) use the
# same interpreter that is running this script.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# Java 11 install locations used for the JVM side of Spark.
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

View File

@ -0,0 +1,90 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"#findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"os.environ['PYSPARK_PYTHON'] = sys.executable\n",
"os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
"os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
"os.environ['JRE_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64/jre\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'spark' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_projects \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m----> 2\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwmf_raw.mediawiki_project_namespace_map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msnapshot\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m==\u001b[39msnapshot)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhostname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwikipedia\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;241m.\u001b[39mselect(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdbname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;241m.\u001b[39mdistinct()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;241m.\u001b[39morderBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m df_projects\u001b[38;5;241m.\u001b[39mshow()\n",
"\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
]
}
],
"source": [
"df_projects = (\n",
" spark.read.table(\"wmf_raw.mediawiki_project_namespace_map\")\n",
" .where(F.col(\"snapshot\")==snapshot)\n",
" .where(F.col(\"hostname\").contains(\"wikipedia\"))\n",
" .select(F.col(\"dbname\").alias(\"wiki_db\"))\n",
" .distinct()\n",
" .orderBy(\"wiki_db\")\n",
")\n",
"df_projects.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}