459 lines
		
	
	
		
			38 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			459 lines
		
	
	
		
			38 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
{
 | 
						|
 "cells": [
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 3,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "import re\n",
 | 
						|
    "import os\n",
 | 
						|
    "from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, IntegerType, ArrayType\n",
 | 
						|
    "from pyspark.sql.functions import count, lit, desc\n",
 | 
						|
    "from pyspark.sql import SparkSession"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 4,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
 | 
						|
    "os.environ['JRE_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64/jre\""
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 5,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "mediawiki_history_path = \"/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/\""
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 6,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "# Note: string unescaping and array conversion are done later\n",
 | 
						|
    "mediawiki_history_schema = StructType([\n",
 | 
						|
    "\n",
 | 
						|
    "    StructField(\"wiki_db\", StringType(), nullable = False),\n",
 | 
						|
    "    StructField(\"event_entity\", StringType(), nullable = False),\n",
 | 
						|
    "    StructField(\"event_type\", StringType(), nullable = False),\n",
 | 
						|
    "    StructField(\"event_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_comment_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "\n",
 | 
						|
    "    StructField(\"event_user_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_text_historical_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_text_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_blocks_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_blocks_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_groups_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_groups_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_bot_by_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_bot_by_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_created_by_self\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_created_by_system\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_created_by_peer\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_is_anonymous\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_registration_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_creation_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_first_edit_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_revision_count\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"event_user_seconds_since_previous_revision\", LongType(), nullable = True),\n",
 | 
						|
    "\n",
 | 
						|
    "    StructField(\"page_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_title_historical_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_title_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_namespace_historical\", IntegerType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_namespace_is_content_historical\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_namespace\", IntegerType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_namespace_is_content\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_is_redirect\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_is_deleted\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_creation_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_first_edit_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_revision_count\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"page_seconds_since_previous_revision\", LongType(), nullable = True),\n",
 | 
						|
    "\n",
 | 
						|
    "    StructField(\"user_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_text_historical_escaped\",  StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_text_escaped\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_blocks_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_blocks_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_groups_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_groups_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_bot_by_historical_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_bot_by_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_created_by_self\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_created_by_system\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_created_by_peer\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_is_anonymous\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_registration_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_creation_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"user_first_edit_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "\n",
 | 
						|
    "    StructField(\"revision_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_parent_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_minor_edit\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_deleted_parts_string\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_deleted_parts_are_suppressed\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_text_bytes\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_text_bytes_diff\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_text_sha1\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_content_model\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_content_format\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_is_deleted_by_page_deletion\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_deleted_by_page_deletion_timestamp\", StringType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_is_identity_reverted\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_first_identity_reverting_revision_id\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_seconds_to_identity_revert\", LongType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_is_identity_revert\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_is_from_before_page_creation\", BooleanType(), nullable = True),\n",
 | 
						|
    "    StructField(\"revision_tags_string\", StringType(), nullable = True)\n",
 | 
						|
    "])"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 7,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "'/usr/lib/jvm/java-11-openjdk-amd64'"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 7,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "os.environ['JAVA_HOME']"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 8,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stderr",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Setting default log level to \"WARN\".\n",
 | 
						|
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
 | 
						|
      "25/01/16 12:17:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "spark = SparkSession.builder.appName('activityData').config(\"spark.driver.extraJavaOptions\", \"-Djava.home=/usr/lib/jvm/java-11-openjdk-amd64\").getOrCreate()\n",
 | 
						|
    "\n",
 | 
						|
    "# Note: It's important to set .option(\"quote\", \"\") to prevent Spark from automatically using double quotes to quote text\n",
 | 
						|
    "mediawiki_history_raw = spark.read.option(\"delimiter\", \"\\t\").option(\"quote\", \"\").schema(mediawiki_history_schema).csv(mediawiki_history_path)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 9,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "\n",
 | 
						|
    "# Unescaping and array-splitting UDFs\n",
 | 
						|
    "def unescape(str):\n",
 | 
						|
    "    if (str is None):\n",
 | 
						|
    "        return None\n",
 | 
						|
    "    else:\n",
 | 
						|
    "        return str.replace(\"\\\\n\", \"\\n\").replace(\"\\\\r\", \"\\r\").replace(\"\\\\t\", \"\\t\")\n",
 | 
						|
    "# The comma splitter applies a negative lookbehind for \\ to prevent splitting escaped commas\n",
 | 
						|
    "def toArray(str):\n",
 | 
						|
    "    if (str is None):\n",
 | 
						|
    "        return []\n",
 | 
						|
    "    else:\n",
 | 
						|
    "        return [s.strip().replace(\"\\\\,\", \",\") for s in re.split(\"(?<!\\\\\\\\),\", unescape(str))]"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 10,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "<function __main__.toArray(str)>"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 10,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "spark.udf.register(\"unescape\", unescape, StringType())\n",
 | 
						|
    "spark.udf.register(\"to_array\", toArray, ArrayType(StringType(), False))"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 11,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "mediawiki_history = mediawiki_history_raw.selectExpr(\n",
 | 
						|
    "  \n",
 | 
						|
    "  \"wiki_db\",\n",
 | 
						|
    "  \"event_entity\",\n",
 | 
						|
    "  \"event_type\",\n",
 | 
						|
    "  \"event_timestamp\",\n",
 | 
						|
    "  \"unescape(event_comment_escaped) AS event_comment\",\n",
 | 
						|
    "  \n",
 | 
						|
    "  \"event_user_id\",\n",
 | 
						|
    "  \"unescape(event_user_text_historical_escaped) AS event_user_text_historical\",\n",
 | 
						|
    "  \"unescape(event_user_text_escaped) AS event_user_text\",\n",
 | 
						|
    "  \"to_array(event_user_blocks_historical_string) AS event_user_blocks_historical\",\n",
 | 
						|
    "  \"to_array(event_user_blocks_string) AS event_user_blocks\",\n",
 | 
						|
    "  \"to_array(event_user_groups_historical_string) AS event_user_groups_historical\",\n",
 | 
						|
    "  \"to_array(event_user_groups_string) AS event_user_groups\",\n",
 | 
						|
    "  \"to_array(event_user_is_bot_by_historical_string) AS event_user_is_bot_by_historical\",\n",
 | 
						|
    "  \"to_array(event_user_is_bot_by_string) AS event_user_is_bot_by\",\n",
 | 
						|
    "  \"event_user_is_created_by_self\",\n",
 | 
						|
    "  \"event_user_is_created_by_system\",\n",
 | 
						|
    "  \"event_user_is_created_by_peer\",\n",
 | 
						|
    "  \"event_user_is_anonymous\",\n",
 | 
						|
    "  \"event_user_registration_timestamp\",\n",
 | 
						|
    "  \"event_user_creation_timestamp\",\n",
 | 
						|
    "  \"event_user_first_edit_timestamp\",\n",
 | 
						|
    "  \"event_user_revision_count\",\n",
 | 
						|
    "  \"event_user_seconds_since_previous_revision\",\n",
 | 
						|
    "  \n",
 | 
						|
    "  \"page_id\",\n",
 | 
						|
    "  \"unescape(page_title_historical_escaped) AS page_title_historical\",\n",
 | 
						|
    "  \"unescape(page_title_escaped) AS page_title\",\n",
 | 
						|
    "  \"page_namespace_historical\",\n",
 | 
						|
    "  \"page_namespace_is_content_historical\",\n",
 | 
						|
    "  \"page_namespace\",\n",
 | 
						|
    "  \"page_namespace_is_content\",\n",
 | 
						|
    "  \"page_is_redirect\",\n",
 | 
						|
    "  \"page_is_deleted\",\n",
 | 
						|
    "  \"page_creation_timestamp\",\n",
 | 
						|
    "  \"page_first_edit_timestamp\",\n",
 | 
						|
    "  \"page_revision_count\",\n",
 | 
						|
    "  \"page_seconds_since_previous_revision\",\n",
 | 
						|
    "  \n",
 | 
						|
    "  \"user_id\",\n",
 | 
						|
    "  \"unescape(user_text_historical_escaped) AS user_text_historical\",\n",
 | 
						|
    "  \"unescape(user_text_escaped) AS user_text\",\n",
 | 
						|
    "  \"to_array(user_blocks_historical_string) AS user_blocks_historical\",\n",
 | 
						|
    "  \"to_array(user_blocks_string) AS user_blocks\",\n",
 | 
						|
    "  \"to_array(user_groups_historical_string) AS user_groups_historical\",\n",
 | 
						|
    "  \"to_array(user_groups_string) AS user_groups\",\n",
 | 
						|
    "  \"to_array(user_is_bot_by_historical_string) AS user_is_bot_by_historical\",\n",
 | 
						|
    "  \"to_array(user_is_bot_by_string) AS user_is_bot_by\",\n",
 | 
						|
    "  \"user_is_created_by_self\",\n",
 | 
						|
    "  \"user_is_created_by_system\",\n",
 | 
						|
    "  \"user_is_created_by_peer\",\n",
 | 
						|
    "  \"user_is_anonymous\",\n",
 | 
						|
    "  \"user_registration_timestamp\",\n",
 | 
						|
    "  \"user_creation_timestamp\",\n",
 | 
						|
    "  \"user_first_edit_timestamp\",\n",
 | 
						|
    "  \n",
 | 
						|
    "  \"revision_id\",\n",
 | 
						|
    "  \"revision_parent_id\",\n",
 | 
						|
    "  \"revision_minor_edit\",\n",
 | 
						|
    "  \"to_array(revision_deleted_parts_string) AS revision_deleted_parts\",\n",
 | 
						|
    "  \"revision_deleted_parts_are_suppressed\",\n",
 | 
						|
    "  \"revision_text_bytes\",\n",
 | 
						|
    "  \"revision_text_bytes_diff\",\n",
 | 
						|
    "  \"revision_text_sha1\",\n",
 | 
						|
    "  \"revision_content_model\",\n",
 | 
						|
    "  \"revision_content_format\",\n",
 | 
						|
    "  \"revision_is_deleted_by_page_deletion\",\n",
 | 
						|
    "  \"revision_deleted_by_page_deletion_timestamp\",\n",
 | 
						|
    "  \"revision_is_identity_reverted\",\n",
 | 
						|
    "  \"revision_first_identity_reverting_revision_id\",\n",
 | 
						|
    "  \"revision_seconds_to_identity_revert\",\n",
 | 
						|
    "  \"revision_is_identity_revert\",\n",
 | 
						|
    "  \"revision_is_from_before_page_creation\",\n",
 | 
						|
    "  \"to_array(revision_tags_string) AS revision_tags\"\n",
 | 
						|
    ")\n"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 12,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "'\\nmediawiki_history.   where(\"event_entity = \\'revision\\' and event_type = \\'create\\'\").   selectExpr(\"wiki_db\", \"SUBSTR(event_timestamp, 0, 7) as month\").   where(\"month = \\'2019-12\\'\").   groupBy(\"wiki_db\", \"month\").   agg(count(lit(1)).alias(\"revision_count\")).   sort(desc(\"revision_count\")).   show(10, False)\\n'"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 12,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "'''\n",
 | 
						|
    "mediawiki_history. \\\n",
 | 
						|
    "  where(\"event_entity = 'revision' and event_type = 'create'\"). \\\n",
 | 
						|
    "  selectExpr(\"wiki_db\", \"SUBSTR(event_timestamp, 0, 7) as month\"). \\\n",
 | 
						|
    "  where(\"month = '2019-12'\"). \\\n",
 | 
						|
    "  groupBy(\"wiki_db\", \"month\"). \\\n",
 | 
						|
    "  agg(count(lit(1)).alias(\"revision_count\")). \\\n",
 | 
						|
    "  sort(desc(\"revision_count\")). \\\n",
 | 
						|
    "  show(10, False)\n",
 | 
						|
    "'''"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 15,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stderr",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "ERROR:root:KeyboardInterrupt while sending command.            (64 + 56) / 8283]\n",
 | 
						|
      "Traceback (most recent call last):\n",
 | 
						|
      "  File \"/opt/conda-analytics/lib/python3.10/site-packages/py4j/java_gateway.py\", line 1038, in send_command\n",
 | 
						|
      "    response = connection.send_command(command)\n",
 | 
						|
      "  File \"/opt/conda-analytics/lib/python3.10/site-packages/py4j/clientserver.py\", line 511, in send_command\n",
 | 
						|
      "    answer = smart_decode(self.stream.readline()[:-1])\n",
 | 
						|
      "  File \"/opt/conda-analytics/lib/python3.10/socket.py\", line 717, in readinto\n",
 | 
						|
      "    return self._sock.recv_into(b)\n",
 | 
						|
      "KeyboardInterrupt\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "ename": "KeyboardInterrupt",
 | 
						|
     "evalue": "",
 | 
						|
     "output_type": "error",
 | 
						|
     "traceback": [
 | 
						|
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 | 
						|
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
 | 
						|
      "Cell \u001b[0;32mIn[15], line 6\u001b[0m\n\u001b[1;32m      2\u001b[0m activity_count_df \u001b[38;5;241m=\u001b[39m activity_count_df\u001b[38;5;241m.\u001b[39mselectExpr(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSUBSTR(event_timestamp, 0, 10) as day\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevent_entity\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevent_type\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      3\u001b[0m activity_count_df \u001b[38;5;241m=\u001b[39m activity_count_df\u001b[38;5;241m.\u001b[39mgroupBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevent_entity\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevent_type\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39magg(count(lit(\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mactivity_count\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m      4\u001b[0m \u001b[43mactivity_count_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m\\\u001b[49m\n\u001b[1;32m      5\u001b[0m \u001b[43m    \u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mday\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m----> 6\u001b[0m \u001b[43m    \u001b[49m\u001b[43mshow\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/pyspark/sql/dataframe.py:947\u001b[0m, in \u001b[0;36mDataFrame.show\u001b[0;34m(self, n, truncate, vertical)\u001b[0m\n\u001b[1;32m    887\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mshow\u001b[39m(\u001b[38;5;28mself\u001b[39m, n: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m20\u001b[39m, truncate: Union[\u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mint\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m, vertical: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    888\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Prints the first ``n`` rows to the console.\u001b[39;00m\n\u001b[1;32m    889\u001b[0m \n\u001b[1;32m    890\u001b[0m \u001b[38;5;124;03m    .. versionadded:: 1.3.0\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    945\u001b[0m \u001b[38;5;124;03m    name | Bob\u001b[39;00m\n\u001b[1;32m    946\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 947\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_show_string\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtruncate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvertical\u001b[49m\u001b[43m)\u001b[49m)\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/pyspark/sql/dataframe.py:978\u001b[0m, in \u001b[0;36mDataFrame._show_string\u001b[0;34m(self, n, truncate, vertical)\u001b[0m\n\u001b[1;32m    969\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m:\n\u001b[1;32m    970\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m PySparkTypeError(\n\u001b[1;32m    971\u001b[0m         error_class\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNOT_BOOL\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    972\u001b[0m         message_parameters\u001b[38;5;241m=\u001b[39m{\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    975\u001b[0m         },\n\u001b[1;32m    976\u001b[0m     )\n\u001b[0;32m--> 978\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshowString\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mint_truncate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvertical\u001b[49m\u001b[43m)\u001b[49m\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/py4j/java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m   1314\u001b[0m args_command, temp_args \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_build_args(\u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m   1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1317\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1318\u001b[0m     args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1319\u001b[0m     proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m-> 1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend_command\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m get_return_value(\n\u001b[1;32m   1323\u001b[0m     answer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget_id, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m   1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/py4j/java_gateway.py:1038\u001b[0m, in \u001b[0;36mGatewayClient.send_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m   1036\u001b[0m connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_connection()\n\u001b[1;32m   1037\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1038\u001b[0m     response \u001b[38;5;241m=\u001b[39m \u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend_command\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1039\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m binary:\n\u001b[1;32m   1040\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m response, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_create_connection_guard(connection)\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/py4j/clientserver.py:511\u001b[0m, in \u001b[0;36mClientServerConnection.send_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m    509\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    510\u001b[0m     \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 511\u001b[0m         answer \u001b[38;5;241m=\u001b[39m smart_decode(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m    512\u001b[0m         logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnswer received: \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(answer))\n\u001b[1;32m    513\u001b[0m         \u001b[38;5;66;03m# Happens when a the other end is dead. There might be an empty\u001b[39;00m\n\u001b[1;32m    514\u001b[0m         \u001b[38;5;66;03m# answer before the socket raises an error.\u001b[39;00m\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/socket.py:717\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m    715\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m    716\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 717\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    718\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m    719\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
 | 
						|
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "name": "stderr",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "25/01/16 12:21:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
 | 
						|
      "25/01/16 12:21:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
 | 
						|
      "25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
 | 
						|
      "25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
 | 
						|
      "[Stage 0:==>                                                  (397 + 56) / 8283]\r"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "activity_count_df = mediawiki_history.where(\"event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null\")\n",
 | 
						|
    "activity_count_df = activity_count_df.selectExpr(\"wiki_db\", \"SUBSTR(event_timestamp, 0, 10) as day\", \"event_entity\", \"event_type\")\n",
 | 
						|
    "activity_count_df = activity_count_df.groupBy(\"wiki_db\", \"day\", \"event_entity\", \"event_type\").agg(count(lit(1)).alias(\"activity_count\"))\n",
 | 
						|
    "activity_count_df.\\\n",
 | 
						|
    "    sort(desc(\"day\")). \\\n",
 | 
						|
    "    show(10, False)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 17,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "+------------+--------------+\n",
 | 
						|
      "|wiki_db     |revision_count|\n",
 | 
						|
      "+------------+--------------+\n",
 | 
						|
      "|kwwiki      |16625         |\n",
 | 
						|
      "|kowikiquote |6779          |\n",
 | 
						|
      "|zuwiktionary|3595          |\n",
 | 
						|
      "+------------+--------------+\n",
 | 
						|
      "\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "'''\n",
 | 
						|
    "filtered_df = mediawiki_history. \\\n",
 | 
						|
    "  where(\"event_entity = 'user' and event_type='create'\"). \\\n",
 | 
						|
    "  drop(\"event_user_blocks_historical\") .\\\n",
 | 
						|
    "  selectExpr(\"wiki_db\", \"SUBSTR(event_timestamp, 0, 7) as month\"). \\\n",
 | 
						|
    "  where(\"event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null\"). \\\n",
 | 
						|
    "  groupBy(\"wiki_db\"). \\\n",
 | 
						|
    "  agg(count(lit(1)).alias(\"revision_count\")). \\\n",
 | 
						|
    "  sort(desc(\"revision_count\")). \\\n",
 | 
						|
    "  show(10, False)\n",
 | 
						|
    "'''"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 25,
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "ename": "Py4JJavaError",
 | 
						|
     "evalue": "An error occurred while calling o228.save.\n: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: tsv. Please find packages at `https://spark.apache.org/third-party-projects.html`.\n\tat org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)\n\tat org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:873)\n\tat org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:260)\n\tat org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)\nCaused by: java.lang.ClassNotFoundException: tsv.DefaultSource\n\tat java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)\n\tat java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)\n\tat java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)\n\tat 
org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)\n\tat scala.util.Try$.apply(Try.scala:213)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)\n\tat scala.util.Failure.orElse(Try.scala:224)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)\n\t... 16 more\n",
 | 
						|
     "output_type": "error",
 | 
						|
     "traceback": [
 | 
						|
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 | 
						|
      "\u001b[0;31mPy4JJavaError\u001b[0m                             Traceback (most recent call last)",
 | 
						|
      "Cell \u001b[0;32mIn[25], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfiltered_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtsv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtest.tsv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/pyspark/sql/readwriter.py:1463\u001b[0m, in \u001b[0;36mDataFrameWriter.save\u001b[0;34m(self, path, format, mode, partitionBy, **options)\u001b[0m\n\u001b[1;32m   1461\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jwrite\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m   1462\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1463\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jwrite\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/py4j/java_gateway.py:1322\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m   1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1317\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1318\u001b[0m     args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m   1319\u001b[0m     proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m   1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1323\u001b[0m \u001b[43m    \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m   1326\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(temp_arg, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_detach\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/pyspark/errors/exceptions/captured.py:179\u001b[0m, in \u001b[0;36mcapture_sql_exception.<locals>.deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m    177\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeco\u001b[39m(\u001b[38;5;241m*\u001b[39ma: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m    178\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 179\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    180\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m Py4JJavaError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    181\u001b[0m         converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n",
 | 
						|
      "File \u001b[0;32m/opt/conda-analytics/lib/python3.10/site-packages/py4j/protocol.py:326\u001b[0m, in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m    324\u001b[0m value \u001b[38;5;241m=\u001b[39m OUTPUT_CONVERTER[\u001b[38;5;28mtype\u001b[39m](answer[\u001b[38;5;241m2\u001b[39m:], gateway_client)\n\u001b[1;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m answer[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m REFERENCE_TYPE:\n\u001b[0;32m--> 326\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m Py4JJavaError(\n\u001b[1;32m    327\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m    328\u001b[0m         \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name), value)\n\u001b[1;32m    329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    330\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m Py4JError(\n\u001b[1;32m    331\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m. Trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{3}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m    332\u001b[0m         \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name, value))\n",
 | 
						|
      "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o228.save.\n: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: tsv. Please find packages at `https://spark.apache.org/third-party-projects.html`.\n\tat org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)\n\tat org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:873)\n\tat org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:260)\n\tat org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)\nCaused by: java.lang.ClassNotFoundException: tsv.DefaultSource\n\tat java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)\n\tat java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)\n\tat 
java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)\n\tat scala.util.Try$.apply(Try.scala:213)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)\n\tat scala.util.Failure.orElse(Try.scala:224)\n\tat org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)\n\t... 16 more\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "activity_count_df.write.format(\"csv\").save(\"011625_dab_yearly.csv\")"
 | 
						|
   ]
 | 
						|
  }
 | 
						|
 ],
 | 
						|
 "metadata": {
 | 
						|
  "kernelspec": {
 | 
						|
   "display_name": "Python 3",
 | 
						|
   "language": "python",
 | 
						|
   "name": "python3"
 | 
						|
  },
 | 
						|
  "language_info": {
 | 
						|
   "codemirror_mode": {
 | 
						|
    "name": "ipython",
 | 
						|
    "version": 3
 | 
						|
   },
 | 
						|
   "file_extension": ".py",
 | 
						|
   "mimetype": "text/x-python",
 | 
						|
   "name": "python",
 | 
						|
   "nbconvert_exporter": "python",
 | 
						|
   "pygments_lexer": "ipython3",
 | 
						|
   "version": "3.10.15"
 | 
						|
  }
 | 
						|
 },
 | 
						|
 "nbformat": 4,
 | 
						|
 "nbformat_minor": 2
 | 
						|
}
 |