diff --git a/src/lib/get_wiki_activity.py b/src/lib/get_wiki_activity.py
index 54a882c..1b33e25 100644
--- a/src/lib/get_wiki_activity.py
+++ b/src/lib/get_wiki_activity.py
@@ -3,13 +3,36 @@ import os
 import re
 import json
 import pandas as pd
+import sys
 import findspark
-findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
+#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
+os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
 from pyspark.sql import SparkSession
 from pyspark.sql import functions as F, types as T, Window
-#breaking
+
+#source /opt/conda-analytics/bin/activate
 import wmfdata.spark as wmfspark
+
+## defining the spark session
+spark = wmfspark.create_session(
+    type="local"
+)
+print(spark)
+
+## snapshot partition to read; CLI arg, with a placeholder default -- update as needed
+snapshot = sys.argv[1] if len(sys.argv) > 1 else "2024-11"
+
+## all wikipedia-family wikis in the namespace map
+df_projects = (
+    spark.read.table("wmf_raw.mediawiki_project_namespace_map")
+    .where(F.col("snapshot") == snapshot)
+    .where(F.col("hostname").contains("wikipedia"))
+    .select(F.col("dbname").alias("wiki_db"))
+    .distinct()
+    .orderBy("wiki_db")
+)
+df_projects.show()
 
 
 # TODO Get a list of bots in the project
 # TODO get all mws wikis
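The hunk above still leaves both TODOs open. As a minimal sketch of the first one, assuming the data lake's sqooped wmf_raw.mediawiki_user_groups table with wiki_db, ug_user, ug_group, and snapshot columns (table and column names should be verified against the cluster; this is not part of the change itself):

    # hypothetical follow-up for "TODO Get a list of bots in the project":
    # users in the formal "bot" group, per wiki, for the same snapshot
    df_bots = (
        spark.read.table("wmf_raw.mediawiki_user_groups")  # assumed table/columns
        .where(F.col("snapshot") == snapshot)
        .where(F.col("ug_group") == "bot")
        .select("wiki_db", F.col("ug_user").alias("user_id"))
        .distinct()
    )
    df_bots.show()

Note this only catches formally flagged bots; unflagged bot accounts would need a separate heuristic.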
.where(F.col("hostname").contains("wikipedia")) + .select(F.col("dbname").alias("wiki_db")) + .distinct() + .orderBy("wiki_db") +) +df_projects.show() + # TODO Get a list of bots in the project # TODO get all mws wikis diff --git a/src/lib/get_wiki_data.ipynb b/src/lib/get_wiki_data.ipynb new file mode 100644 index 0000000..341c86d --- /dev/null +++ b/src/lib/get_wiki_data.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pandas'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_1566937/562128272.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas'" + ] + } + ], + "source": [ + "import bz2\n", + "import os\n", + "import re\n", + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'findspark'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_1566937/3416374666.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'HADOOP_CONF_DIR'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfunctions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mWindow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'findspark'" + ] + } + ], + 
"source": [ + "import findspark\n", + "findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n", + "os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import functions as F, types as T, Window\n", + "\n", + "#source /opt/conda-analytics/bin/activate\n", + "import wmfdata.spark as wmfspark" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'wmfspark' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_1566937/1742447738.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## defining the spark session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mspark_config\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m spark = wmfspark.create_session(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mapp_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Pyspark notebook'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regular'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'wmfspark' is not defined" + ] + } + ], + "source": [ + "## defining the spark session\n", + "spark_config = {}\n", + "spark = wmfspark.create_session(\n", + " app_name='Pyspark notebook', \n", + " type='regular'\n", + "# extra_settings=spark_config\n", + ")\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"success\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}