diff --git a/src/lib/expand_dumps.ipynb b/src/lib/expand_dumps.ipynb
new file mode 100644
index 0000000..93d161f
--- /dev/null
+++ b/src/lib/expand_dumps.ipynb
@@ -0,0 +1,18 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/lib/get_dumps.py b/src/lib/get_dumps.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/lib/get_wiki_data.ipynb b/src/lib/get_wiki_data.ipynb
deleted file mode 100644
index 341c86d..0000000
--- a/src/lib/get_wiki_data.ipynb
+++ /dev/null
@@ -1,117 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'pandas'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_1566937/562128272.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas'"
-     ]
-    }
-   ],
-   "source": [
-    "import bz2\n",
-    "import os\n",
-    "import re\n",
-    "import json\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'findspark'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_1566937/3416374666.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfindspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menviron\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'HADOOP_CONF_DIR'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfunctions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mWindow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'findspark'"
-     ]
-    }
-   ],
-   "source": [
-    "import findspark\n",
-    "findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
-    "os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
-    "from pyspark.sql import SparkSession\n",
-    "from pyspark.sql import functions as F, types as T, Window\n",
-    "\n",
-    "#source /opt/conda-analytics/bin/activate\n",
-    "import wmfdata.spark as wmfspark"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'wmfspark' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_1566937/1742447738.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## defining the spark session\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mspark_config\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m spark = wmfspark.create_session(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mapp_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Pyspark notebook'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regular'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'wmfspark' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "## defining the spark session\n",
-    "spark_config = {}\n",
-    "spark = wmfspark.create_session(\n",
-    "    app_name='Pyspark notebook', \n",
-    "    type='regular'\n",
-    "# extra_settings=spark_config\n",
-    ")\n",
-    "spark"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"success\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/src/lib/get_wiki_activity.py b/src/lib/old_wiki_approach/get_wiki_activity.py
similarity index 80%
rename from src/lib/get_wiki_activity.py
rename to src/lib/old_wiki_approach/get_wiki_activity.py
index 1b33e25..353deee 100644
--- a/src/lib/get_wiki_activity.py
+++ b/src/lib/old_wiki_approach/get_wiki_activity.py
@@ -8,6 +8,10 @@ import sys
 import findspark
 #findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
 os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
+os.environ['PYSPARK_PYTHON'] = sys.executable
+os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
+os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
+os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"
 from pyspark.sql import SparkSession
 from pyspark.sql import functions as F, types as T, Window
 
@@ -42,4 +46,4 @@ df_projects.show()
 
 
 # TODO get the # page creation and page edit
-# events of those bots for each of the wikis
\ No newline at end of file
+# events of those bots for each of the wikis
diff --git a/src/lib/old_wiki_approach/get_wiki_data.ipynb b/src/lib/old_wiki_approach/get_wiki_data.ipynb
new file mode 100644
index 0000000..c8556ca
--- /dev/null
+++ b/src/lib/old_wiki_approach/get_wiki_data.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import bz2\n",
+    "import os\n",
+    "import re\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import findspark\n",
+    "#findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
+    "os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
+    "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
+    "os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
+    "os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
+    "os.environ['JRE_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64/jre\"\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql import functions as F, types as T, Window\n",
+    "\n",
+    "#source /opt/conda-analytics/bin/activate\n",
+    "import wmfdata.spark as wmfspark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'spark' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_projects \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m----> 2\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwmf_raw.mediawiki_project_namespace_map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msnapshot\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m==\u001b[39msnapshot)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhostname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwikipedia\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;241m.\u001b[39mselect(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdbname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;241m.\u001b[39mdistinct()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;241m.\u001b[39morderBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m df_projects\u001b[38;5;241m.\u001b[39mshow()\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "df_projects = (\n",
+    "    spark.read.table(\"wmf_raw.mediawiki_project_namespace_map\")\n",
+    "    .where(F.col(\"snapshot\")==snapshot)\n",
+    "    .where(F.col(\"hostname\").contains(\"wikipedia\"))\n",
+    "    .select(F.col(\"dbname\").alias(\"wiki_db\"))\n",
+    "    .distinct()\n",
+    "    .orderBy(\"wiki_db\")\n",
+    ")\n",
+    "df_projects.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/main.py b/src/main.py
index af0f71c..8ce50db 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,4 +3,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
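
Note on next steps: src/lib/get_dumps.py lands empty here, and src/lib/expand_dumps.ipynb so far only bookmarks the wikitech Python/Pandas examples page. As a starting point, below is a minimal sketch of loading one Mediawiki history dump file with pandas in the style of that page. The path, filename, and leading field names are assumptions to be checked against the field list published on the linked wikitech page; they are not part of this change.

    import pandas as pd

    # Hypothetical path to one downloaded dump slice: the dumps are
    # bz2-compressed, tab-separated, and ship without a header row.
    DUMP_PATH = "2024-01.simplewiki.all-time.tsv.bz2"

    df = pd.read_csv(
        DUMP_PATH,
        sep="\t",           # tab-separated fields
        compression="bz2",  # pandas decompresses transparently
        header=None,        # no header row in the dump files
        quoting=3,          # csv.QUOTE_NONE: fields are not quoted
        dtype=str,          # read everything as strings first
    )

    # Assumption: the first four fields are wiki_db, event_entity,
    # event_type, and event_timestamp, per the published schema; the
    # remaining columns stay numbered until confirmed against it.
    df = df.rename(columns={0: "wiki_db", 1: "event_entity",
                            2: "event_type", 3: "event_timestamp"})
    print(df["event_type"].value_counts())

Reading everything as strings and renaming only verified columns keeps the sketch safe against schema drift; typed parsing (timestamps, booleans) can follow once the field list is pinned down in get_dumps.py.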