{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3b9f2c1e-7a4d-4e8b-9c55-1d2e3f4a5b6c",
   "metadata": {},
   "source": [
    "# Phabricator comments as a ConvoKit corpus\n",
    "\n",
    "Loads the Phabricator comment export, reshapes it into the utterance table that `Corpus.from_pandas` expects, keeps the comments posted between 2012-11-01 and 2013-11-01, and builds a corpus from them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fa71c616-e22d-4f6e-9599-34d85c05179b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
      " warnings.warn(\"Can't initialize NVML\")\n"
     ]
    }
   ],
   "source": [
    "import convokit\n",
    "import pandas as pd\n",
    "from convokit import Corpus, download"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d0e1f2a-3b4c-4d5e-8f60-718293a4b5c6",
   "metadata": {},
   "source": [
    "## Load and clean the comment export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9c7100b-308c-4a57-bc53-91318d081cbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): absolute cluster path; adjust when running elsewhere.\n",
    "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
    "# Keep the raw export untouched so the cleaning cell below can be re-run at will.\n",
    "phab_raw_df = pd.read_csv(phab_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "101909ac-4794-4d35-aa8d-76ea301ef397",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phabricator PHID of the Gerrit account; its comments are flagged so they can be removed as bot traffic.\n",
    "GERRIT_BOT_PHID = 'PHID-USER-idceizaw6elwiwm5xshb'\n",
    "# Study window in epoch seconds, both bounds exclusive:\n",
    "# after 2012-11-01 00:00:00 UTC and before 2013-11-01 00:00:00 UTC.\n",
    "WINDOW_START_TS = 1351728000\n",
    "WINDOW_END_TS = 1383264000\n",
    "\n",
    "\n",
    "def add_reply_chain(comments_df):\n",
    "    \"\"\"Return a time-ordered copy of comments_df with a fresh reply_to column.\n",
    "\n",
    "    Every comment replies to the comment posted just before it in the same\n",
    "    conversation_id; the earliest comment of each conversation gets None.\n",
    "    \"\"\"\n",
    "    ordered = comments_df.sort_values('timestamp', kind='mergesort').copy()\n",
    "    previous_id = ordered.groupby('conversation_id')['id'].shift()\n",
    "    # Go through object dtype: on a float column recent pandas turns None back into NaN.\n",
    "    ordered['reply_to'] = previous_id.astype(object).where(previous_id.notna(), None)\n",
    "    return ordered\n",
    "\n",
    "\n",
    "# Derive everything from the untouched phab_raw_df so this cell can be re-run safely.\n",
    "phab_df = add_reply_chain(\n",
    "    phab_raw_df\n",
    "    .assign(\n",
    "        isGerrit=phab_raw_df['AuthorPHID'] == GERRIT_BOT_PHID,\n",
    "        # 1-based CSV row number, stored as a string so id and reply_to share one type.\n",
    "        id=(phab_raw_df.index + 1).astype(str),\n",
    "        # Force every comment body to be a string (missing bodies, presumably NaN, become '')\n",
    "        # so ConvoKit no longer has to cast non-string text itself.\n",
    "        comment_text=phab_raw_df['comment_text'].fillna('').astype(str),\n",
    "    )\n",
    "    .rename(columns={\n",
    "        'date_created': 'timestamp',\n",
    "        'comment_text': 'text',\n",
    "        'AuthorPHID': 'speaker',\n",
    "        'TaskPHID': 'conversation_id',\n",
    "        'WMFaffil': 'meta.affil',\n",
    "        'isGerrit': 'meta.gerrit',\n",
    "    })\n",
    ")\n",
    "\n",
    "in_window = (phab_df['timestamp'] > WINDOW_START_TS) & (phab_df['timestamp'] < WINDOW_END_TS)\n",
    "# Rebuild the chain after filtering so that no comment replies to one outside the window\n",
    "# and the first in-window comment of each task is the conversation root.\n",
    "filtered_phab_df = add_reply_chain(phab_df[in_window])\n",
    "\n",
    "assert filtered_phab_df['id'].is_unique, 'utterance ids must be unique'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10277it [00:00, 30651.56it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[91mWARNING: 
\u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n",
      "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n",
      "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n",
      "\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "phab_corpus = Corpus.from_pandas(filtered_phab_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Speakers: 236\n",
      "Number of Utterances: 10277\n",
      "Number of Conversations: 2205\n"
     ]
    }
   ],
   "source": [
    "phab_corpus.print_summary_stats()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4a7e9b2-5d13-4f86-a0b9-2e6f8d1c3a57",
   "metadata": {},
   "source": [
    "## Remove Gerrit bot comments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2a9cb88-876f-416b-ba9e-665170b24aa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop comments authored by the Gerrit bot, then build the bot-free corpus from scratch.\n",
    "# Filtering the DataFrame keeps this cell re-runnable on its own. The traceback previously\n",
    "# saved here (KeyError: no meta entry for utterance_711) came from\n",
    "# Corpus.filter_utterances(phab_corpus, ...), yet the same call evidently succeeded in\n",
    "# another run (a 9245-utterance corpus reached the parser), so the failure looks tied to\n",
    "# kernel state. NOTE(review): root cause not confirmed.\n",
    "# Like Corpus.filter_utterances, this can leave reply_to pointing at a removed comment.\n",
    "human_comments_df = filtered_phab_df[~filtered_phab_df['meta.gerrit'].astype(bool)]\n",
    "no_bots_phab_corpus = Corpus.from_pandas(human_comments_df)\n",
    "no_bots_phab_corpus.print_summary_stats()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8f1a2b3-c4d5-4e6f-9a7b-8c9d0e1f2a3b",
   "metadata": {},
   "source": [
    "## Parse the comment text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "#looking at how language use differs between the two groups\n",
    "from convokit.text_processing import TextParser\n",
    "# Leave input_field at its default (None) so the parser reads each utterance's text.\n",
    "# A string input_field names a metadata field instead; this notebook creates no 'text'\n",
    "# metadata field, so input_field='text' would presumably leave every utterance unparsed.\n",
    "# NOTE(review): confirm against the TextProcessor documentation.\n",
    "# verbosity=1000 reports progress every 1000 utterances instead of flooding the output.\n",
    "parser = TextParser(verbosity=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "146d6526-b80f-4981-af2d-277b90852d5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the parser over the bot-free corpus and rebind the name to the corpus it returns.\n",
    "no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}