1
0
mw-lifecycle-analysis/text_analysis/case1/.ipynb_checkpoints/ve_phab_convos-checkpoint.ipynb
2025-03-05 11:24:39 -08:00

382 lines
21 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fa71c616-e22d-4f6e-9599-34d85c05179b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import convokit\n",
"import pandas as pd\n",
"from convokit import Corpus, download"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "e9c7100b-308c-4a57-bc53-91318d081cbb",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "101909ac-4794-4d35-aa8d-76ea301ef397",
"metadata": {},
"outputs": [],
"source": [
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
" 'date_created': 'timestamp',\n",
" 'comment_text': 'text',\n",
" 'AuthorPHID': 'speaker',\n",
" 'TaskPHID': 'conversation_id',\n",
" 'WMFaffil':'meta.affil',\n",
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"# after 11-1-2012 before 11-1-2013\n",
"filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10277it [00:00, 30651.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n",
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n",
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n",
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"phab_corpus = Corpus.from_pandas(filtered_phab_df)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Speakers: 236\n",
"Number of Utterances: 10277\n",
"Number of Conversations: 2205\n"
]
}
],
"source": [
"phab_corpus.print_summary_stats()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "e2a9cb88-876f-416b-ba9e-665170b24aa9",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'This BackendMapper does not have an entry for the meta with id utterance_711.'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m \u001b[43mCorpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mphab_corpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:647\u001b[0m, in \u001b[0;36mCorpus.filter_utterances\u001b[0;34m(source_corpus, selector)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfilter_utterances\u001b[39m(source_corpus: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorpus\u001b[39m\u001b[38;5;124m\"\u001b[39m, selector: Callable[[Utterance], \u001b[38;5;28mbool\u001b[39m]):\n\u001b[1;32m 637\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;124;03m Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no\u001b[39;00m\n\u001b[1;32m 639\u001b[0m \u001b[38;5;124;03m guarantees with regard to maintaining conversational integrity and should be used with care.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[38;5;124;03m :return: a new Corpus with a subset of the Utterances\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 647\u001b[0m utts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msource_corpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 648\u001b[0m new_corpus \u001b[38;5;241m=\u001b[39m Corpus(utterances\u001b[38;5;241m=\u001b[39mutts)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m convo \u001b[38;5;129;01min\u001b[39;00m new_corpus\u001b[38;5;241m.\u001b[39miter_conversations():\n",
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:444\u001b[0m, in \u001b[0;36mCorpus.iter_utterances\u001b[0;34m(self, selector)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 437\u001b[0m \u001b[38;5;124;03mGet utterances in the Corpus, with an optional selector that filters for Utterances that should be included.\u001b[39;00m\n\u001b[1;32m 438\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m:return: a generator of Utterances\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mutterances\u001b[38;5;241m.\u001b[39mvalues():\n\u001b[0;32m--> 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m v\n",
"Cell \u001b[0;32mIn[35], line 1\u001b[0m, in \u001b[0;36m<lambda>\u001b[0;34m(utt)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m Corpus\u001b[38;5;241m.\u001b[39mfilter_utterances(phab_corpus, \u001b[38;5;28;01mlambda\u001b[39;00m utt: \u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/convoKitMeta.py:37\u001b[0m, in \u001b[0;36mConvoKitMeta.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# affect corpus metadata backend, but only acting on the copy of it.\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m immutable_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mcomplex\u001b[39m, \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mfrozenset\u001b[39m)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, immutable_types):\n",
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/backendMapper.py:173\u001b[0m, in \u001b[0;36mMemMapper.get_data\u001b[0;34m(self, component_type, component_id, property_name, index)\u001b[0m\n\u001b[1;32m 171\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_collection(component_type)\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m component_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m collection:\n\u001b[0;32m--> 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis BackendMapper does not have an entry for the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 175\u001b[0m )\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m property_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m collection[component_id]\n",
"\u001b[0;31mKeyError\u001b[0m: 'This BackendMapper does not have an entry for the meta with id utterance_711.'"
]
}
],
"source": [
"no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)\n",
"no_bots_phab_corpus.print_summary_stats()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc",
"metadata": {},
"outputs": [],
"source": [
"#looking at how language use differs between the two groups \n",
"from convokit.text_processing import TextParser\n",
"parser = TextParser(input_field='text')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "146d6526-b80f-4981-af2d-277b90852d5f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"050/9245 utterances processed\n",
"100/9245 utterances processed\n",
"150/9245 utterances processed\n",
"200/9245 utterances processed\n",
"250/9245 utterances processed\n",
"300/9245 utterances processed\n",
"350/9245 utterances processed\n",
"400/9245 utterances processed\n",
"450/9245 utterances processed\n",
"500/9245 utterances processed\n",
"550/9245 utterances processed\n",
"600/9245 utterances processed\n",
"650/9245 utterances processed\n",
"700/9245 utterances processed\n",
"750/9245 utterances processed\n",
"800/9245 utterances processed\n",
"850/9245 utterances processed\n",
"900/9245 utterances processed\n",
"950/9245 utterances processed\n",
"1000/9245 utterances processed\n",
"1050/9245 utterances processed\n",
"1100/9245 utterances processed\n",
"1150/9245 utterances processed\n",
"1200/9245 utterances processed\n",
"1250/9245 utterances processed\n",
"1300/9245 utterances processed\n",
"1350/9245 utterances processed\n",
"1400/9245 utterances processed\n",
"1450/9245 utterances processed\n",
"1500/9245 utterances processed\n",
"1550/9245 utterances processed\n",
"1600/9245 utterances processed\n",
"1650/9245 utterances processed\n",
"1700/9245 utterances processed\n",
"1750/9245 utterances processed\n",
"1800/9245 utterances processed\n",
"1850/9245 utterances processed\n",
"1900/9245 utterances processed\n",
"1950/9245 utterances processed\n",
"2000/9245 utterances processed\n",
"2050/9245 utterances processed\n",
"2100/9245 utterances processed\n",
"2150/9245 utterances processed\n",
"2200/9245 utterances processed\n",
"2250/9245 utterances processed\n",
"2300/9245 utterances processed\n",
"2350/9245 utterances processed\n",
"2400/9245 utterances processed\n",
"2450/9245 utterances processed\n",
"2500/9245 utterances processed\n",
"2550/9245 utterances processed\n",
"2600/9245 utterances processed\n",
"2650/9245 utterances processed\n",
"2700/9245 utterances processed\n",
"2750/9245 utterances processed\n",
"2800/9245 utterances processed\n",
"2850/9245 utterances processed\n",
"2900/9245 utterances processed\n",
"2950/9245 utterances processed\n",
"3000/9245 utterances processed\n",
"3050/9245 utterances processed\n",
"3100/9245 utterances processed\n",
"3150/9245 utterances processed\n",
"3200/9245 utterances processed\n",
"3250/9245 utterances processed\n",
"3300/9245 utterances processed\n",
"3350/9245 utterances processed\n",
"3400/9245 utterances processed\n",
"3450/9245 utterances processed\n",
"3500/9245 utterances processed\n",
"3550/9245 utterances processed\n",
"3600/9245 utterances processed\n",
"3650/9245 utterances processed\n",
"3700/9245 utterances processed\n",
"3750/9245 utterances processed\n",
"3800/9245 utterances processed\n",
"3850/9245 utterances processed\n",
"3900/9245 utterances processed\n",
"3950/9245 utterances processed\n",
"4000/9245 utterances processed\n",
"4050/9245 utterances processed\n",
"4100/9245 utterances processed\n",
"4150/9245 utterances processed\n",
"4200/9245 utterances processed\n",
"4250/9245 utterances processed\n",
"4300/9245 utterances processed\n",
"4350/9245 utterances processed\n",
"4400/9245 utterances processed\n",
"4450/9245 utterances processed\n",
"4500/9245 utterances processed\n",
"4550/9245 utterances processed\n",
"4600/9245 utterances processed\n",
"4650/9245 utterances processed\n",
"4700/9245 utterances processed\n",
"4750/9245 utterances processed\n",
"4800/9245 utterances processed\n",
"4850/9245 utterances processed\n",
"4900/9245 utterances processed\n",
"4950/9245 utterances processed\n",
"5000/9245 utterances processed\n",
"5050/9245 utterances processed\n",
"5100/9245 utterances processed\n",
"5150/9245 utterances processed\n",
"5200/9245 utterances processed\n",
"5250/9245 utterances processed\n",
"5300/9245 utterances processed\n",
"5350/9245 utterances processed\n",
"5400/9245 utterances processed\n",
"5450/9245 utterances processed\n",
"5500/9245 utterances processed\n",
"5550/9245 utterances processed\n",
"5600/9245 utterances processed\n",
"5650/9245 utterances processed\n",
"5700/9245 utterances processed\n",
"5750/9245 utterances processed\n",
"5800/9245 utterances processed\n",
"5850/9245 utterances processed\n",
"5900/9245 utterances processed\n",
"5950/9245 utterances processed\n",
"6000/9245 utterances processed\n",
"6050/9245 utterances processed\n",
"6100/9245 utterances processed\n",
"6150/9245 utterances processed\n",
"6200/9245 utterances processed\n",
"6250/9245 utterances processed\n",
"6300/9245 utterances processed\n",
"6350/9245 utterances processed\n",
"6400/9245 utterances processed\n",
"6450/9245 utterances processed\n",
"6500/9245 utterances processed\n",
"6550/9245 utterances processed\n",
"6600/9245 utterances processed\n",
"6650/9245 utterances processed\n",
"6700/9245 utterances processed\n",
"6750/9245 utterances processed\n",
"6800/9245 utterances processed\n",
"6850/9245 utterances processed\n",
"6900/9245 utterances processed\n",
"6950/9245 utterances processed\n",
"7000/9245 utterances processed\n",
"7050/9245 utterances processed\n",
"7100/9245 utterances processed\n",
"7150/9245 utterances processed\n",
"7200/9245 utterances processed\n",
"7250/9245 utterances processed\n",
"7300/9245 utterances processed\n",
"7350/9245 utterances processed\n",
"7400/9245 utterances processed\n",
"7450/9245 utterances processed\n",
"7500/9245 utterances processed\n",
"7550/9245 utterances processed\n",
"7600/9245 utterances processed\n",
"7650/9245 utterances processed\n",
"7700/9245 utterances processed\n",
"7750/9245 utterances processed\n",
"7800/9245 utterances processed\n",
"7850/9245 utterances processed\n",
"7900/9245 utterances processed\n",
"7950/9245 utterances processed\n",
"8000/9245 utterances processed\n",
"8050/9245 utterances processed\n",
"8100/9245 utterances processed\n",
"8150/9245 utterances processed\n",
"8200/9245 utterances processed\n",
"8250/9245 utterances processed\n",
"8300/9245 utterances processed\n",
"8350/9245 utterances processed\n",
"8400/9245 utterances processed\n",
"8450/9245 utterances processed\n",
"8500/9245 utterances processed\n",
"8550/9245 utterances processed\n",
"8600/9245 utterances processed\n",
"8650/9245 utterances processed\n",
"8700/9245 utterances processed\n",
"8750/9245 utterances processed\n",
"8800/9245 utterances processed\n",
"8850/9245 utterances processed\n",
"8900/9245 utterances processed\n",
"8950/9245 utterances processed\n",
"9000/9245 utterances processed\n",
"9050/9245 utterances processed\n",
"9100/9245 utterances processed\n",
"9150/9245 utterances processed\n",
"9200/9245 utterances processed\n",
"9245/9245 utterances processed\n"
]
}
],
"source": [
"no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}