From 28df3eb7292c6eebaa21a95f6f6c7f8b1ee0df3d Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Wed, 5 Mar 2025 11:24:39 -0800 Subject: [PATCH] updated convokit modeling --- mgaughan-rstudio-server_24539959.out | 17 - .../ve_phab_convos-checkpoint.ipynb | 381 ++++++++++++++++++ text_analysis/case1/ve_phab_convos.ipynb | 381 ++++++++++++++++++ 3 files changed, 762 insertions(+), 17 deletions(-) delete mode 100644 mgaughan-rstudio-server_24539959.out create mode 100644 text_analysis/case1/.ipynb_checkpoints/ve_phab_convos-checkpoint.ipynb create mode 100644 text_analysis/case1/ve_phab_convos.ipynb diff --git a/mgaughan-rstudio-server_24539959.out b/mgaughan-rstudio-server_24539959.out deleted file mode 100644 index 3e9c1e3..0000000 --- a/mgaughan-rstudio-server_24539959.out +++ /dev/null @@ -1,17 +0,0 @@ -1. SSH tunnel from your workstation using the following command: - - ssh -N -L 8787:n3439:56597 mjilg@klone.hyak.uw.edu - - and point your web browser to http://localhost:8787 - -2. log in to RStudio Server using the following credentials: - - user: mjilg - password: wO04FrVKQP5bSLRuzEi5 - -When done using RStudio Server, terminate the job by: - -1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) -2. Issue the following command on the login node: - - scancel -f 24539959 diff --git a/text_analysis/case1/.ipynb_checkpoints/ve_phab_convos-checkpoint.ipynb b/text_analysis/case1/.ipynb_checkpoints/ve_phab_convos-checkpoint.ipynb new file mode 100644 index 0000000..540d561 --- /dev/null +++ b/text_analysis/case1/.ipynb_checkpoints/ve_phab_convos-checkpoint.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fa71c616-e22d-4f6e-9599-34d85c05179b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], + "source": [ + "import convokit\n", + "import pandas as pd\n", + "from convokit import Corpus, download" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e9c7100b-308c-4a57-bc53-91318d081cbb", + "metadata": {}, + "outputs": [], + "source": [ + "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n", + "phab_df = pd.read_csv(phab_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "101909ac-4794-4d35-aa8d-76ea301ef397", + "metadata": {}, + "outputs": [], + "source": [ + "#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n", + "phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n", + "#cleaning df\n", + "phab_df['id'] = phab_df.index + 1\n", + "#may have to build out the reply_to column \n", + "phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n", + "phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n", + "\n", + "phab_df = phab_df.rename(columns={\n", + " 'date_created': 'timestamp',\n", + " 'comment_text': 'text',\n", + " 'AuthorPHID': 'speaker',\n", + " 'TaskPHID': 'conversation_id',\n", + " 'WMFaffil':'meta.affil',\n", + " 'isGerrit': 'meta.gerrit'\n", + "})\n", + "# after 11-1-2012 before 11-1-2013\n", + "filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10277it [00:00, 30651.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "phab_corpus = Corpus.from_pandas(filtered_phab_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Speakers: 236\n", + "Number of Utterances: 10277\n", + "Number of Conversations: 2205\n" + ] + } + ], + "source": [ + "phab_corpus.print_summary_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e2a9cb88-876f-416b-ba9e-665170b24aa9", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'This BackendMapper does not have an entry for the meta with id utterance_711.'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m \u001b[43mCorpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mphab_corpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:647\u001b[0m, in \u001b[0;36mCorpus.filter_utterances\u001b[0;34m(source_corpus, selector)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfilter_utterances\u001b[39m(source_corpus: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorpus\u001b[39m\u001b[38;5;124m\"\u001b[39m, selector: Callable[[Utterance], \u001b[38;5;28mbool\u001b[39m]):\n\u001b[1;32m 637\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;124;03m Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no\u001b[39;00m\n\u001b[1;32m 639\u001b[0m \u001b[38;5;124;03m guarantees with regard to maintaining conversational integrity and should be used with care.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[38;5;124;03m :return: a new Corpus with a subset of the Utterances\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 647\u001b[0m utts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msource_corpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 648\u001b[0m new_corpus \u001b[38;5;241m=\u001b[39m Corpus(utterances\u001b[38;5;241m=\u001b[39mutts)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m convo \u001b[38;5;129;01min\u001b[39;00m new_corpus\u001b[38;5;241m.\u001b[39miter_conversations():\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:444\u001b[0m, in \u001b[0;36mCorpus.iter_utterances\u001b[0;34m(self, selector)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 437\u001b[0m \u001b[38;5;124;03mGet utterances in the Corpus, with an optional selector that filters for Utterances that should be included.\u001b[39;00m\n\u001b[1;32m 438\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m:return: a generator of Utterances\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mutterances\u001b[38;5;241m.\u001b[39mvalues():\n\u001b[0;32m--> 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m v\n", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m, in \u001b[0;36m\u001b[0;34m(utt)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m Corpus\u001b[38;5;241m.\u001b[39mfilter_utterances(phab_corpus, \u001b[38;5;28;01mlambda\u001b[39;00m utt: \u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/convoKitMeta.py:37\u001b[0m, in \u001b[0;36mConvoKitMeta.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# affect corpus metadata backend, but only acting on the copy of it.\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m immutable_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mcomplex\u001b[39m, \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mfrozenset\u001b[39m)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, immutable_types):\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/backendMapper.py:173\u001b[0m, in \u001b[0;36mMemMapper.get_data\u001b[0;34m(self, component_type, component_id, property_name, index)\u001b[0m\n\u001b[1;32m 171\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_collection(component_type)\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m component_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m collection:\n\u001b[0;32m--> 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis BackendMapper does not have an entry for the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 175\u001b[0m )\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m property_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m collection[component_id]\n", + "\u001b[0;31mKeyError\u001b[0m: 'This BackendMapper does not have an entry for the meta with id utterance_711.'" + ] + } + ], + "source": [ + "no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)\n", + "no_bots_phab_corpus.print_summary_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc", + "metadata": {}, + "outputs": [], + "source": [ + "#looking at how language use differs between the two groups \n", + "from convokit.text_processing import TextParser\n", + "parser = TextParser(input_field='text')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "146d6526-b80f-4981-af2d-277b90852d5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "050/9245 utterances processed\n", + "100/9245 utterances processed\n", + "150/9245 utterances processed\n", + "200/9245 utterances processed\n", + "250/9245 utterances processed\n", + "300/9245 utterances processed\n", + "350/9245 utterances processed\n", + "400/9245 utterances processed\n", + "450/9245 utterances processed\n", + "500/9245 utterances processed\n", + "550/9245 utterances processed\n", + "600/9245 utterances processed\n", + "650/9245 utterances processed\n", + "700/9245 utterances processed\n", + "750/9245 utterances processed\n", + "800/9245 utterances processed\n", + "850/9245 utterances processed\n", + "900/9245 utterances processed\n", + "950/9245 utterances processed\n", + "1000/9245 utterances processed\n", + "1050/9245 utterances processed\n", + "1100/9245 utterances processed\n", + "1150/9245 utterances processed\n", + "1200/9245 utterances processed\n", + "1250/9245 utterances processed\n", + "1300/9245 utterances processed\n", + "1350/9245 utterances processed\n", + "1400/9245 utterances processed\n", + "1450/9245 utterances processed\n", + "1500/9245 utterances processed\n", + "1550/9245 utterances processed\n", + "1600/9245 utterances processed\n", + "1650/9245 utterances processed\n", + "1700/9245 utterances processed\n", + "1750/9245 utterances processed\n", + "1800/9245 utterances processed\n", + "1850/9245 utterances processed\n", + "1900/9245 utterances processed\n", + "1950/9245 utterances processed\n", + "2000/9245 utterances processed\n", + "2050/9245 utterances processed\n", + "2100/9245 utterances processed\n", + "2150/9245 utterances processed\n", + "2200/9245 utterances processed\n", + "2250/9245 utterances processed\n", + "2300/9245 utterances processed\n", + "2350/9245 utterances processed\n", + "2400/9245 utterances processed\n", + "2450/9245 utterances processed\n", + "2500/9245 utterances processed\n", + "2550/9245 utterances processed\n", + "2600/9245 utterances processed\n", + "2650/9245 utterances processed\n", + "2700/9245 utterances processed\n", + "2750/9245 utterances processed\n", + "2800/9245 utterances processed\n", + "2850/9245 utterances processed\n", + "2900/9245 utterances processed\n", + "2950/9245 utterances processed\n", + "3000/9245 utterances processed\n", + "3050/9245 utterances processed\n", + "3100/9245 utterances processed\n", + "3150/9245 utterances processed\n", + "3200/9245 utterances processed\n", + "3250/9245 utterances processed\n", + "3300/9245 utterances processed\n", + "3350/9245 utterances processed\n", + "3400/9245 utterances processed\n", + "3450/9245 utterances processed\n", + "3500/9245 utterances processed\n", + "3550/9245 utterances processed\n", + "3600/9245 utterances processed\n", + "3650/9245 utterances processed\n", + "3700/9245 utterances processed\n", + "3750/9245 utterances processed\n", + "3800/9245 utterances processed\n", + "3850/9245 utterances processed\n", + "3900/9245 utterances processed\n", + "3950/9245 utterances processed\n", + "4000/9245 utterances processed\n", + "4050/9245 utterances processed\n", + "4100/9245 utterances processed\n", + "4150/9245 utterances processed\n", + "4200/9245 utterances processed\n", + "4250/9245 utterances processed\n", + "4300/9245 utterances processed\n", + "4350/9245 utterances processed\n", + "4400/9245 utterances processed\n", + "4450/9245 utterances processed\n", + "4500/9245 utterances processed\n", + "4550/9245 utterances processed\n", + "4600/9245 utterances processed\n", + "4650/9245 utterances processed\n", + "4700/9245 utterances processed\n", + "4750/9245 utterances processed\n", + "4800/9245 utterances processed\n", + "4850/9245 utterances processed\n", + "4900/9245 utterances processed\n", + "4950/9245 utterances processed\n", + "5000/9245 utterances processed\n", + "5050/9245 utterances processed\n", + "5100/9245 utterances processed\n", + "5150/9245 utterances processed\n", + "5200/9245 utterances processed\n", + "5250/9245 utterances processed\n", + "5300/9245 utterances processed\n", + "5350/9245 utterances processed\n", + "5400/9245 utterances processed\n", + "5450/9245 utterances processed\n", + "5500/9245 utterances processed\n", + "5550/9245 utterances processed\n", + "5600/9245 utterances processed\n", + "5650/9245 utterances processed\n", + "5700/9245 utterances processed\n", + "5750/9245 utterances processed\n", + "5800/9245 utterances processed\n", + "5850/9245 utterances processed\n", + "5900/9245 utterances processed\n", + "5950/9245 utterances processed\n", + "6000/9245 utterances processed\n", + "6050/9245 utterances processed\n", + "6100/9245 utterances processed\n", + "6150/9245 utterances processed\n", + "6200/9245 utterances processed\n", + "6250/9245 utterances processed\n", + "6300/9245 utterances processed\n", + "6350/9245 utterances processed\n", + "6400/9245 utterances processed\n", + "6450/9245 utterances processed\n", + "6500/9245 utterances processed\n", + "6550/9245 utterances processed\n", + "6600/9245 utterances processed\n", + "6650/9245 utterances processed\n", + "6700/9245 utterances processed\n", + "6750/9245 utterances processed\n", + "6800/9245 utterances processed\n", + "6850/9245 utterances processed\n", + "6900/9245 utterances processed\n", + "6950/9245 utterances processed\n", + "7000/9245 utterances processed\n", + "7050/9245 utterances processed\n", + "7100/9245 utterances processed\n", + "7150/9245 utterances processed\n", + "7200/9245 utterances processed\n", + "7250/9245 utterances processed\n", + "7300/9245 utterances processed\n", + "7350/9245 utterances processed\n", + "7400/9245 utterances processed\n", + "7450/9245 utterances processed\n", + "7500/9245 utterances processed\n", + "7550/9245 utterances processed\n", + "7600/9245 utterances processed\n", + "7650/9245 utterances processed\n", + "7700/9245 utterances processed\n", + "7750/9245 utterances processed\n", + "7800/9245 utterances processed\n", + "7850/9245 utterances processed\n", + "7900/9245 utterances processed\n", + "7950/9245 utterances processed\n", + "8000/9245 utterances processed\n", + "8050/9245 utterances processed\n", + "8100/9245 utterances processed\n", + "8150/9245 utterances processed\n", + "8200/9245 utterances processed\n", + "8250/9245 utterances processed\n", + "8300/9245 utterances processed\n", + "8350/9245 utterances processed\n", + "8400/9245 utterances processed\n", + "8450/9245 utterances processed\n", + "8500/9245 utterances processed\n", + "8550/9245 utterances processed\n", + "8600/9245 utterances processed\n", + "8650/9245 utterances processed\n", + "8700/9245 utterances processed\n", + "8750/9245 utterances processed\n", + "8800/9245 utterances processed\n", + "8850/9245 utterances processed\n", + "8900/9245 utterances processed\n", + "8950/9245 utterances processed\n", + "9000/9245 utterances processed\n", + "9050/9245 utterances processed\n", + "9100/9245 utterances processed\n", + "9150/9245 utterances processed\n", + "9200/9245 utterances processed\n", + "9245/9245 utterances processed\n" + ] + } + ], + "source": [ + "no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_analysis/case1/ve_phab_convos.ipynb b/text_analysis/case1/ve_phab_convos.ipynb new file mode 100644 index 0000000..540d561 --- /dev/null +++ b/text_analysis/case1/ve_phab_convos.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fa71c616-e22d-4f6e-9599-34d85c05179b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], + "source": [ + "import convokit\n", + "import pandas as pd\n", + "from convokit import Corpus, download" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e9c7100b-308c-4a57-bc53-91318d081cbb", + "metadata": {}, + "outputs": [], + "source": [ + "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n", + "phab_df = pd.read_csv(phab_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "101909ac-4794-4d35-aa8d-76ea301ef397", + "metadata": {}, + "outputs": [], + "source": [ + "#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n", + "phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n", + "#cleaning df\n", + "phab_df['id'] = phab_df.index + 1\n", + "#may have to build out the reply_to column \n", + "phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n", + "phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n", + "\n", + "phab_df = phab_df.rename(columns={\n", + " 'date_created': 'timestamp',\n", + " 'comment_text': 'text',\n", + " 'AuthorPHID': 'speaker',\n", + " 'TaskPHID': 'conversation_id',\n", + " 'WMFaffil':'meta.affil',\n", + " 'isGerrit': 'meta.gerrit'\n", + "})\n", + "# after 11-1-2012 before 11-1-2013\n", + "filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10277it [00:00, 30651.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n", + "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "phab_corpus = Corpus.from_pandas(filtered_phab_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Speakers: 236\n", + "Number of Utterances: 10277\n", + "Number of Conversations: 2205\n" + ] + } + ], + "source": [ + "phab_corpus.print_summary_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e2a9cb88-876f-416b-ba9e-665170b24aa9", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'This BackendMapper does not have an entry for the meta with id utterance_711.'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m \u001b[43mCorpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mphab_corpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:647\u001b[0m, in \u001b[0;36mCorpus.filter_utterances\u001b[0;34m(source_corpus, selector)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfilter_utterances\u001b[39m(source_corpus: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorpus\u001b[39m\u001b[38;5;124m\"\u001b[39m, selector: Callable[[Utterance], \u001b[38;5;28mbool\u001b[39m]):\n\u001b[1;32m 637\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;124;03m Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no\u001b[39;00m\n\u001b[1;32m 639\u001b[0m \u001b[38;5;124;03m guarantees with regard to maintaining conversational integrity and should be used with care.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[38;5;124;03m :return: a new Corpus with a subset of the Utterances\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 647\u001b[0m utts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msource_corpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 648\u001b[0m new_corpus \u001b[38;5;241m=\u001b[39m Corpus(utterances\u001b[38;5;241m=\u001b[39mutts)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m convo \u001b[38;5;129;01min\u001b[39;00m new_corpus\u001b[38;5;241m.\u001b[39miter_conversations():\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:444\u001b[0m, in \u001b[0;36mCorpus.iter_utterances\u001b[0;34m(self, selector)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 437\u001b[0m \u001b[38;5;124;03mGet utterances in the Corpus, with an optional selector that filters for Utterances that should be included.\u001b[39;00m\n\u001b[1;32m 438\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m:return: a generator of Utterances\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mutterances\u001b[38;5;241m.\u001b[39mvalues():\n\u001b[0;32m--> 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m v\n", + "Cell \u001b[0;32mIn[35], line 1\u001b[0m, in \u001b[0;36m\u001b[0;34m(utt)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m Corpus\u001b[38;5;241m.\u001b[39mfilter_utterances(phab_corpus, \u001b[38;5;28;01mlambda\u001b[39;00m utt: \u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/convoKitMeta.py:37\u001b[0m, in \u001b[0;36mConvoKitMeta.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# affect corpus metadata backend, but only acting on the copy of it.\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m immutable_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mcomplex\u001b[39m, \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mfrozenset\u001b[39m)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, immutable_types):\n", + "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/backendMapper.py:173\u001b[0m, in \u001b[0;36mMemMapper.get_data\u001b[0;34m(self, component_type, component_id, property_name, index)\u001b[0m\n\u001b[1;32m 171\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_collection(component_type)\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m component_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m collection:\n\u001b[0;32m--> 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis BackendMapper does not have an entry for the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 175\u001b[0m )\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m property_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m collection[component_id]\n", + "\u001b[0;31mKeyError\u001b[0m: 'This BackendMapper does not have an entry for the meta with id utterance_711.'" + ] + } + ], + "source": [ + "no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)\n", + "no_bots_phab_corpus.print_summary_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc", + "metadata": {}, + "outputs": [], + "source": [ + "#looking at how language use differs between the two groups \n", + "from convokit.text_processing import TextParser\n", + "parser = TextParser(input_field='text')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "146d6526-b80f-4981-af2d-277b90852d5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "050/9245 utterances processed\n", + "100/9245 utterances processed\n", + "150/9245 utterances processed\n", + "200/9245 utterances processed\n", + "250/9245 utterances processed\n", + "300/9245 utterances processed\n", + "350/9245 utterances processed\n", + "400/9245 utterances processed\n", + "450/9245 utterances processed\n", + "500/9245 utterances processed\n", + "550/9245 utterances processed\n", + "600/9245 utterances processed\n", + "650/9245 utterances processed\n", + "700/9245 utterances processed\n", + "750/9245 utterances processed\n", + "800/9245 utterances processed\n", + "850/9245 utterances processed\n", + "900/9245 utterances processed\n", + "950/9245 utterances processed\n", + "1000/9245 utterances processed\n", + "1050/9245 utterances processed\n", + "1100/9245 utterances processed\n", + "1150/9245 utterances processed\n", + "1200/9245 utterances processed\n", + "1250/9245 utterances processed\n", + "1300/9245 utterances processed\n", + "1350/9245 utterances processed\n", + "1400/9245 utterances processed\n", + "1450/9245 utterances processed\n", + "1500/9245 utterances processed\n", + "1550/9245 utterances processed\n", + "1600/9245 utterances processed\n", + "1650/9245 utterances processed\n", + "1700/9245 utterances processed\n", + "1750/9245 utterances processed\n", + "1800/9245 utterances processed\n", + "1850/9245 utterances processed\n", + "1900/9245 utterances processed\n", + "1950/9245 utterances processed\n", + "2000/9245 utterances processed\n", + "2050/9245 utterances processed\n", + "2100/9245 utterances processed\n", + "2150/9245 utterances processed\n", + "2200/9245 utterances processed\n", + "2250/9245 utterances processed\n", + "2300/9245 utterances processed\n", + "2350/9245 utterances processed\n", + "2400/9245 utterances processed\n", + "2450/9245 utterances processed\n", + "2500/9245 utterances processed\n", + "2550/9245 utterances processed\n", + "2600/9245 utterances processed\n", + "2650/9245 utterances processed\n", + "2700/9245 utterances processed\n", + "2750/9245 utterances processed\n", + "2800/9245 utterances processed\n", + "2850/9245 utterances processed\n", + "2900/9245 utterances processed\n", + "2950/9245 utterances processed\n", + "3000/9245 utterances processed\n", + "3050/9245 utterances processed\n", + "3100/9245 utterances processed\n", + "3150/9245 utterances processed\n", + "3200/9245 utterances processed\n", + "3250/9245 utterances processed\n", + "3300/9245 utterances processed\n", + "3350/9245 utterances processed\n", + "3400/9245 utterances processed\n", + "3450/9245 utterances processed\n", + "3500/9245 utterances processed\n", + "3550/9245 utterances processed\n", + "3600/9245 utterances processed\n", + "3650/9245 utterances processed\n", + "3700/9245 utterances processed\n", + "3750/9245 utterances processed\n", + "3800/9245 utterances processed\n", + "3850/9245 utterances processed\n", + "3900/9245 utterances processed\n", + "3950/9245 utterances processed\n", + "4000/9245 utterances processed\n", + "4050/9245 utterances processed\n", + "4100/9245 utterances processed\n", + "4150/9245 utterances processed\n", + "4200/9245 utterances processed\n", + "4250/9245 utterances processed\n", + "4300/9245 utterances processed\n", + "4350/9245 utterances processed\n", + "4400/9245 utterances processed\n", + "4450/9245 utterances processed\n", + "4500/9245 utterances processed\n", + "4550/9245 utterances processed\n", + "4600/9245 utterances processed\n", + "4650/9245 utterances processed\n", + "4700/9245 utterances processed\n", + "4750/9245 utterances processed\n", + "4800/9245 utterances processed\n", + "4850/9245 utterances processed\n", + "4900/9245 utterances processed\n", + "4950/9245 utterances processed\n", + "5000/9245 utterances processed\n", + "5050/9245 utterances processed\n", + "5100/9245 utterances processed\n", + "5150/9245 utterances processed\n", + "5200/9245 utterances processed\n", + "5250/9245 utterances processed\n", + "5300/9245 utterances processed\n", + "5350/9245 utterances processed\n", + "5400/9245 utterances processed\n", + "5450/9245 utterances processed\n", + "5500/9245 utterances processed\n", + "5550/9245 utterances processed\n", + "5600/9245 utterances processed\n", + "5650/9245 utterances processed\n", + "5700/9245 utterances processed\n", + "5750/9245 utterances processed\n", + "5800/9245 utterances processed\n", + "5850/9245 utterances processed\n", + "5900/9245 utterances processed\n", + "5950/9245 utterances processed\n", + "6000/9245 utterances processed\n", + "6050/9245 utterances processed\n", + "6100/9245 utterances processed\n", + "6150/9245 utterances processed\n", + "6200/9245 utterances processed\n", + "6250/9245 utterances processed\n", + "6300/9245 utterances processed\n", + "6350/9245 utterances processed\n", + "6400/9245 utterances processed\n", + "6450/9245 utterances processed\n", + "6500/9245 utterances processed\n", + "6550/9245 utterances processed\n", + "6600/9245 utterances processed\n", + "6650/9245 utterances processed\n", + "6700/9245 utterances processed\n", + "6750/9245 utterances processed\n", + "6800/9245 utterances processed\n", + "6850/9245 utterances processed\n", + "6900/9245 utterances processed\n", + "6950/9245 utterances processed\n", + "7000/9245 utterances processed\n", + "7050/9245 utterances processed\n", + "7100/9245 utterances processed\n", + "7150/9245 utterances processed\n", + "7200/9245 utterances processed\n", + "7250/9245 utterances processed\n", + "7300/9245 utterances processed\n", + "7350/9245 utterances processed\n", + "7400/9245 utterances processed\n", + "7450/9245 utterances processed\n", + "7500/9245 utterances processed\n", + "7550/9245 utterances processed\n", + "7600/9245 utterances processed\n", + "7650/9245 utterances processed\n", + "7700/9245 utterances processed\n", + "7750/9245 utterances processed\n", + "7800/9245 utterances processed\n", + "7850/9245 utterances processed\n", + "7900/9245 utterances processed\n", + "7950/9245 utterances processed\n", + "8000/9245 utterances processed\n", + "8050/9245 utterances processed\n", + "8100/9245 utterances processed\n", + "8150/9245 utterances processed\n", + "8200/9245 utterances processed\n", + "8250/9245 utterances processed\n", + "8300/9245 utterances processed\n", + "8350/9245 utterances processed\n", + "8400/9245 utterances processed\n", + "8450/9245 utterances processed\n", + "8500/9245 utterances processed\n", + "8550/9245 utterances processed\n", + "8600/9245 utterances processed\n", + "8650/9245 utterances processed\n", + "8700/9245 utterances processed\n", + "8750/9245 utterances processed\n", + "8800/9245 utterances processed\n", + "8850/9245 utterances processed\n", + "8900/9245 utterances processed\n", + "8950/9245 utterances processed\n", + "9000/9245 utterances processed\n", + "9050/9245 utterances processed\n", + "9100/9245 utterances processed\n", + "9150/9245 utterances processed\n", + "9200/9245 utterances processed\n", + "9245/9245 utterances processed\n" + ] + } + ], + "source": [ + "no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}