
wip getting wmf spark up, on ice for now

Matthew Gaughan 2024-12-12 13:34:46 -06:00
parent 6cb3296e8e
commit 3c04d1ced5
3 changed files with 237 additions and 2 deletions


@@ -0,0 +1,92 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"24/12/12 11:55:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"## defining the spark session\n",
"spark_config = {}\n",
"spark = wmfspark.create_session(\n",
" app_name='Pyspark notebook', \n",
" type='regular'\n",
"# extra_settings=spark_config\n",
")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"success\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
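
The commented-out extra_settings=spark_config line in the notebook above hints at how the session was meant to be tuned. A minimal sketch of passing that dict through, assuming wmfdata.spark.create_session accepts the keyword arguments exactly as written in the cell; the two Spark properties below are standard Spark configuration keys chosen purely for illustration:

    import wmfdata.spark as wmfspark

    # Illustrative settings only; any valid Spark properties could go here.
    spark_config = {
        "spark.driver.memory": "4g",
        "spark.sql.shuffle.partitions": "256",
    }

    # Same call as the notebook cell, with extra_settings uncommented.
    spark = wmfspark.create_session(
        app_name="Pyspark notebook",
        type="regular",
        extra_settings=spark_config,
    )
    print(spark)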


@@ -3,13 +3,39 @@ import os
 import re
 import json
 import pandas as pd
+import sys
 import findspark
-findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
-os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
+#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
 from pyspark.sql import SparkSession
 from pyspark.sql import functions as F, types as T, Window
+#breaking
+#source /opt/conda-analytics/bin/activate
 import wmfdata.spark as wmfspark
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F, types as T, Window
+#source /opt/conda-analytics/bin/activate
+import wmfdata.spark as wmfspark
+
+## defining the spark session
+spark_config = {}
+spark = wmfspark.create_session(
+    type="local"
+)
+print(spark)
+
+df_projects = (
+    spark.read.table("wmf_raw.mediawiki_project_namespace_map")
+    .where(F.col("snapshot")==snapshot)
+    .where(F.col("hostname").contains("wikipedia"))
+    .select(F.col("dbname").alias("wiki_db"))
+    .distinct()
+    .orderBy("wiki_db")
+)
+df_projects.show()
+
 # TODO Get a list of bots in the project
 # TODO get all mws wikis
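
Note that the new df_projects query above references a variable named snapshot that is never defined in this hunk. A minimal end-to-end sketch, assuming the table's snapshot partition takes a month string (the value used here is a placeholder, not taken from the commit), and reusing the local session exactly as the diff creates it:

    import wmfdata.spark as wmfspark
    from pyspark.sql import functions as F

    # Placeholder partition value; assumed to be a monthly snapshot string.
    snapshot = "2024-11"

    spark = wmfspark.create_session(type="local")

    # Same query as in the diff: distinct Wikipedia wiki_db names from the namespace map.
    df_projects = (
        spark.read.table("wmf_raw.mediawiki_project_namespace_map")
        .where(F.col("snapshot") == snapshot)
        .where(F.col("hostname").contains("wikipedia"))
        .select(F.col("dbname").alias("wiki_db"))
        .distinct()
        .orderBy("wiki_db")
    )
    df_projects.show()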

src/lib/get_wiki_data.ipynb (new file, +117)

@@ -0,0 +1,117 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pandas'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/562128272.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas'"
]
}
],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'findspark'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/3416374666.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'HADOOP_CONF_DIR'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfunctions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mWindow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'findspark'"
]
}
],
"source": [
"import findspark\n",
"findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'wmfspark' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1566937/1742447738.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## defining the spark session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mspark_config\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m spark = wmfspark.create_session(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mapp_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Pyspark notebook'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regular'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'wmfspark' is not defined"
]
}
],
"source": [
"## defining the spark session\n",
"spark_config = {}\n",
"spark = wmfspark.create_session(\n",
" app_name='Pyspark notebook', \n",
" type='regular'\n",
"# extra_settings=spark_config\n",
")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"success\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
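
The ModuleNotFoundError tracebacks saved in this notebook (pandas, findspark) show the Jupyter kernel was not running in an environment that has those packages installed; the commented "source /opt/conda-analytics/bin/activate" lines elsewhere in the commit suggest the conda-analytics environment was the intended one. A small standard-library-only diagnostic sketch for checking which interpreter is backing the kernel and whether the failing imports are available:

    import sys
    import importlib.util

    # Which Python binary the kernel is using, and where it searches for packages.
    print("interpreter:", sys.executable)
    for entry in sys.path:
        print("  path:", entry)

    # Check the modules this notebook failed to import, plus wmfdata.
    for name in ("pandas", "findspark", "wmfdata"):
        found = importlib.util.find_spec(name) is not None
        print(name, "available" if found else "missing")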