updated convokit modeling
This commit is contained in:
parent
4f6e190d18
commit
28df3eb729
@ -1,17 +0,0 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
|
||||||
|
|
||||||
ssh -N -L 8787:n3439:56597 mjilg@klone.hyak.uw.edu
|
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
|
||||||
|
|
||||||
2. Log in to RStudio Server using the following credentials:
|
|
||||||
|
|
||||||
user: mjilg
|
|
||||||
password: wO04FrVKQP5bSLRuzEi5
|
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
|
||||||
2. Issue the following command on the login node:
|
|
||||||
|
|
||||||
scancel -f 24539959
|
|
@ -0,0 +1,381 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "fa71c616-e22d-4f6e-9599-34d85c05179b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
|
||||||
|
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import convokit\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from convokit import Corpus, download"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "e9c7100b-308c-4a57-bc53-91318d081cbb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
|
||||||
|
"phab_df = pd.read_csv(phab_path)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"id": "101909ac-4794-4d35-aa8d-76ea301ef397",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# flag rows authored by the Gerrit account (PHID-USER-idceizaw6elwiwm5xshb)\n",
|
||||||
|
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||||||
|
"#cleaning df\n",
|
||||||
|
"phab_df['id'] = phab_df.index + 1\n",
|
||||||
|
"# build the reply_to column: each comment replies to the previous comment in the same task (None for the first)\n",
|
||||||
|
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||||||
|
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||||||
|
"\n",
|
||||||
|
"phab_df = phab_df.rename(columns={\n",
|
||||||
|
" 'date_created': 'timestamp',\n",
|
||||||
|
" 'comment_text': 'text',\n",
|
||||||
|
" 'AuthorPHID': 'speaker',\n",
|
||||||
|
" 'TaskPHID': 'conversation_id',\n",
|
||||||
|
" 'WMFaffil':'meta.affil',\n",
|
||||||
|
" 'isGerrit': 'meta.gerrit'\n",
|
||||||
|
"})\n",
|
||||||
|
"# keep comments strictly after 2012-11-01 and strictly before 2013-11-01 (epoch seconds, 00:00 UTC)\n",
|
||||||
|
"filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"10277it [00:00, 30651.56it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"phab_corpus = Corpus.from_pandas(filtered_phab_df)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Number of Speakers: 236\n",
|
||||||
|
"Number of Utterances: 10277\n",
|
||||||
|
"Number of Conversations: 2205\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"phab_corpus.print_summary_stats()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "e2a9cb88-876f-416b-ba9e-665170b24aa9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "KeyError",
|
||||||
|
"evalue": "'This BackendMapper does not have an entry for the meta with id utterance_711.'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m \u001b[43mCorpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mphab_corpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:647\u001b[0m, in \u001b[0;36mCorpus.filter_utterances\u001b[0;34m(source_corpus, selector)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfilter_utterances\u001b[39m(source_corpus: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorpus\u001b[39m\u001b[38;5;124m\"\u001b[39m, selector: Callable[[Utterance], \u001b[38;5;28mbool\u001b[39m]):\n\u001b[1;32m 637\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;124;03m Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no\u001b[39;00m\n\u001b[1;32m 639\u001b[0m \u001b[38;5;124;03m guarantees with regard to maintaining conversational integrity and should be used with care.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[38;5;124;03m :return: a new Corpus with a subset of the Utterances\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 647\u001b[0m utts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msource_corpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 648\u001b[0m new_corpus \u001b[38;5;241m=\u001b[39m Corpus(utterances\u001b[38;5;241m=\u001b[39mutts)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m convo \u001b[38;5;129;01min\u001b[39;00m new_corpus\u001b[38;5;241m.\u001b[39miter_conversations():\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:444\u001b[0m, in \u001b[0;36mCorpus.iter_utterances\u001b[0;34m(self, selector)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 437\u001b[0m \u001b[38;5;124;03mGet utterances in the Corpus, with an optional selector that filters for Utterances that should be included.\u001b[39;00m\n\u001b[1;32m 438\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m:return: a generator of Utterances\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mutterances\u001b[38;5;241m.\u001b[39mvalues():\n\u001b[0;32m--> 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m v\n",
|
||||||
|
"Cell \u001b[0;32mIn[35], line 1\u001b[0m, in \u001b[0;36m<lambda>\u001b[0;34m(utt)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m Corpus\u001b[38;5;241m.\u001b[39mfilter_utterances(phab_corpus, \u001b[38;5;28;01mlambda\u001b[39;00m utt: \u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/convoKitMeta.py:37\u001b[0m, in \u001b[0;36mConvoKitMeta.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# affect corpus metadata backend, but only acting on the copy of it.\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m immutable_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mfloat\u001b[39m, 
\u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mcomplex\u001b[39m, \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mfrozenset\u001b[39m)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, immutable_types):\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/backendMapper.py:173\u001b[0m, in \u001b[0;36mMemMapper.get_data\u001b[0;34m(self, component_type, component_id, property_name, index)\u001b[0m\n\u001b[1;32m 171\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_collection(component_type)\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m component_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m collection:\n\u001b[0;32m--> 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis BackendMapper does not have an entry for the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 175\u001b[0m )\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m property_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m collection[component_id]\n",
|
||||||
|
"\u001b[0;31mKeyError\u001b[0m: 'This BackendMapper does not have an entry for the meta with id utterance_711.'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)\n",
|
||||||
|
"no_bots_phab_corpus.print_summary_stats()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 49,
|
||||||
|
"id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#looking at how language use differs between the two groups \n",
|
||||||
|
"from convokit.text_processing import TextParser\n",
|
||||||
|
"parser = TextParser(input_field='text')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 50,
|
||||||
|
"id": "146d6526-b80f-4981-af2d-277b90852d5f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"050/9245 utterances processed\n",
|
||||||
|
"100/9245 utterances processed\n",
|
||||||
|
"150/9245 utterances processed\n",
|
||||||
|
"200/9245 utterances processed\n",
|
||||||
|
"250/9245 utterances processed\n",
|
||||||
|
"300/9245 utterances processed\n",
|
||||||
|
"350/9245 utterances processed\n",
|
||||||
|
"400/9245 utterances processed\n",
|
||||||
|
"450/9245 utterances processed\n",
|
||||||
|
"500/9245 utterances processed\n",
|
||||||
|
"550/9245 utterances processed\n",
|
||||||
|
"600/9245 utterances processed\n",
|
||||||
|
"650/9245 utterances processed\n",
|
||||||
|
"700/9245 utterances processed\n",
|
||||||
|
"750/9245 utterances processed\n",
|
||||||
|
"800/9245 utterances processed\n",
|
||||||
|
"850/9245 utterances processed\n",
|
||||||
|
"900/9245 utterances processed\n",
|
||||||
|
"950/9245 utterances processed\n",
|
||||||
|
"1000/9245 utterances processed\n",
|
||||||
|
"1050/9245 utterances processed\n",
|
||||||
|
"1100/9245 utterances processed\n",
|
||||||
|
"1150/9245 utterances processed\n",
|
||||||
|
"1200/9245 utterances processed\n",
|
||||||
|
"1250/9245 utterances processed\n",
|
||||||
|
"1300/9245 utterances processed\n",
|
||||||
|
"1350/9245 utterances processed\n",
|
||||||
|
"1400/9245 utterances processed\n",
|
||||||
|
"1450/9245 utterances processed\n",
|
||||||
|
"1500/9245 utterances processed\n",
|
||||||
|
"1550/9245 utterances processed\n",
|
||||||
|
"1600/9245 utterances processed\n",
|
||||||
|
"1650/9245 utterances processed\n",
|
||||||
|
"1700/9245 utterances processed\n",
|
||||||
|
"1750/9245 utterances processed\n",
|
||||||
|
"1800/9245 utterances processed\n",
|
||||||
|
"1850/9245 utterances processed\n",
|
||||||
|
"1900/9245 utterances processed\n",
|
||||||
|
"1950/9245 utterances processed\n",
|
||||||
|
"2000/9245 utterances processed\n",
|
||||||
|
"2050/9245 utterances processed\n",
|
||||||
|
"2100/9245 utterances processed\n",
|
||||||
|
"2150/9245 utterances processed\n",
|
||||||
|
"2200/9245 utterances processed\n",
|
||||||
|
"2250/9245 utterances processed\n",
|
||||||
|
"2300/9245 utterances processed\n",
|
||||||
|
"2350/9245 utterances processed\n",
|
||||||
|
"2400/9245 utterances processed\n",
|
||||||
|
"2450/9245 utterances processed\n",
|
||||||
|
"2500/9245 utterances processed\n",
|
||||||
|
"2550/9245 utterances processed\n",
|
||||||
|
"2600/9245 utterances processed\n",
|
||||||
|
"2650/9245 utterances processed\n",
|
||||||
|
"2700/9245 utterances processed\n",
|
||||||
|
"2750/9245 utterances processed\n",
|
||||||
|
"2800/9245 utterances processed\n",
|
||||||
|
"2850/9245 utterances processed\n",
|
||||||
|
"2900/9245 utterances processed\n",
|
||||||
|
"2950/9245 utterances processed\n",
|
||||||
|
"3000/9245 utterances processed\n",
|
||||||
|
"3050/9245 utterances processed\n",
|
||||||
|
"3100/9245 utterances processed\n",
|
||||||
|
"3150/9245 utterances processed\n",
|
||||||
|
"3200/9245 utterances processed\n",
|
||||||
|
"3250/9245 utterances processed\n",
|
||||||
|
"3300/9245 utterances processed\n",
|
||||||
|
"3350/9245 utterances processed\n",
|
||||||
|
"3400/9245 utterances processed\n",
|
||||||
|
"3450/9245 utterances processed\n",
|
||||||
|
"3500/9245 utterances processed\n",
|
||||||
|
"3550/9245 utterances processed\n",
|
||||||
|
"3600/9245 utterances processed\n",
|
||||||
|
"3650/9245 utterances processed\n",
|
||||||
|
"3700/9245 utterances processed\n",
|
||||||
|
"3750/9245 utterances processed\n",
|
||||||
|
"3800/9245 utterances processed\n",
|
||||||
|
"3850/9245 utterances processed\n",
|
||||||
|
"3900/9245 utterances processed\n",
|
||||||
|
"3950/9245 utterances processed\n",
|
||||||
|
"4000/9245 utterances processed\n",
|
||||||
|
"4050/9245 utterances processed\n",
|
||||||
|
"4100/9245 utterances processed\n",
|
||||||
|
"4150/9245 utterances processed\n",
|
||||||
|
"4200/9245 utterances processed\n",
|
||||||
|
"4250/9245 utterances processed\n",
|
||||||
|
"4300/9245 utterances processed\n",
|
||||||
|
"4350/9245 utterances processed\n",
|
||||||
|
"4400/9245 utterances processed\n",
|
||||||
|
"4450/9245 utterances processed\n",
|
||||||
|
"4500/9245 utterances processed\n",
|
||||||
|
"4550/9245 utterances processed\n",
|
||||||
|
"4600/9245 utterances processed\n",
|
||||||
|
"4650/9245 utterances processed\n",
|
||||||
|
"4700/9245 utterances processed\n",
|
||||||
|
"4750/9245 utterances processed\n",
|
||||||
|
"4800/9245 utterances processed\n",
|
||||||
|
"4850/9245 utterances processed\n",
|
||||||
|
"4900/9245 utterances processed\n",
|
||||||
|
"4950/9245 utterances processed\n",
|
||||||
|
"5000/9245 utterances processed\n",
|
||||||
|
"5050/9245 utterances processed\n",
|
||||||
|
"5100/9245 utterances processed\n",
|
||||||
|
"5150/9245 utterances processed\n",
|
||||||
|
"5200/9245 utterances processed\n",
|
||||||
|
"5250/9245 utterances processed\n",
|
||||||
|
"5300/9245 utterances processed\n",
|
||||||
|
"5350/9245 utterances processed\n",
|
||||||
|
"5400/9245 utterances processed\n",
|
||||||
|
"5450/9245 utterances processed\n",
|
||||||
|
"5500/9245 utterances processed\n",
|
||||||
|
"5550/9245 utterances processed\n",
|
||||||
|
"5600/9245 utterances processed\n",
|
||||||
|
"5650/9245 utterances processed\n",
|
||||||
|
"5700/9245 utterances processed\n",
|
||||||
|
"5750/9245 utterances processed\n",
|
||||||
|
"5800/9245 utterances processed\n",
|
||||||
|
"5850/9245 utterances processed\n",
|
||||||
|
"5900/9245 utterances processed\n",
|
||||||
|
"5950/9245 utterances processed\n",
|
||||||
|
"6000/9245 utterances processed\n",
|
||||||
|
"6050/9245 utterances processed\n",
|
||||||
|
"6100/9245 utterances processed\n",
|
||||||
|
"6150/9245 utterances processed\n",
|
||||||
|
"6200/9245 utterances processed\n",
|
||||||
|
"6250/9245 utterances processed\n",
|
||||||
|
"6300/9245 utterances processed\n",
|
||||||
|
"6350/9245 utterances processed\n",
|
||||||
|
"6400/9245 utterances processed\n",
|
||||||
|
"6450/9245 utterances processed\n",
|
||||||
|
"6500/9245 utterances processed\n",
|
||||||
|
"6550/9245 utterances processed\n",
|
||||||
|
"6600/9245 utterances processed\n",
|
||||||
|
"6650/9245 utterances processed\n",
|
||||||
|
"6700/9245 utterances processed\n",
|
||||||
|
"6750/9245 utterances processed\n",
|
||||||
|
"6800/9245 utterances processed\n",
|
||||||
|
"6850/9245 utterances processed\n",
|
||||||
|
"6900/9245 utterances processed\n",
|
||||||
|
"6950/9245 utterances processed\n",
|
||||||
|
"7000/9245 utterances processed\n",
|
||||||
|
"7050/9245 utterances processed\n",
|
||||||
|
"7100/9245 utterances processed\n",
|
||||||
|
"7150/9245 utterances processed\n",
|
||||||
|
"7200/9245 utterances processed\n",
|
||||||
|
"7250/9245 utterances processed\n",
|
||||||
|
"7300/9245 utterances processed\n",
|
||||||
|
"7350/9245 utterances processed\n",
|
||||||
|
"7400/9245 utterances processed\n",
|
||||||
|
"7450/9245 utterances processed\n",
|
||||||
|
"7500/9245 utterances processed\n",
|
||||||
|
"7550/9245 utterances processed\n",
|
||||||
|
"7600/9245 utterances processed\n",
|
||||||
|
"7650/9245 utterances processed\n",
|
||||||
|
"7700/9245 utterances processed\n",
|
||||||
|
"7750/9245 utterances processed\n",
|
||||||
|
"7800/9245 utterances processed\n",
|
||||||
|
"7850/9245 utterances processed\n",
|
||||||
|
"7900/9245 utterances processed\n",
|
||||||
|
"7950/9245 utterances processed\n",
|
||||||
|
"8000/9245 utterances processed\n",
|
||||||
|
"8050/9245 utterances processed\n",
|
||||||
|
"8100/9245 utterances processed\n",
|
||||||
|
"8150/9245 utterances processed\n",
|
||||||
|
"8200/9245 utterances processed\n",
|
||||||
|
"8250/9245 utterances processed\n",
|
||||||
|
"8300/9245 utterances processed\n",
|
||||||
|
"8350/9245 utterances processed\n",
|
||||||
|
"8400/9245 utterances processed\n",
|
||||||
|
"8450/9245 utterances processed\n",
|
||||||
|
"8500/9245 utterances processed\n",
|
||||||
|
"8550/9245 utterances processed\n",
|
||||||
|
"8600/9245 utterances processed\n",
|
||||||
|
"8650/9245 utterances processed\n",
|
||||||
|
"8700/9245 utterances processed\n",
|
||||||
|
"8750/9245 utterances processed\n",
|
||||||
|
"8800/9245 utterances processed\n",
|
||||||
|
"8850/9245 utterances processed\n",
|
||||||
|
"8900/9245 utterances processed\n",
|
||||||
|
"8950/9245 utterances processed\n",
|
||||||
|
"9000/9245 utterances processed\n",
|
||||||
|
"9050/9245 utterances processed\n",
|
||||||
|
"9100/9245 utterances processed\n",
|
||||||
|
"9150/9245 utterances processed\n",
|
||||||
|
"9200/9245 utterances processed\n",
|
||||||
|
"9245/9245 utterances processed\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.21"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
381
text_analysis/case1/ve_phab_convos.ipynb
Normal file
381
text_analysis/case1/ve_phab_convos.ipynb
Normal file
@ -0,0 +1,381 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "fa71c616-e22d-4f6e-9599-34d85c05179b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
|
||||||
|
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import convokit\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from convokit import Corpus, download"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "e9c7100b-308c-4a57-bc53-91318d081cbb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
|
||||||
|
"phab_df = pd.read_csv(phab_path)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"id": "101909ac-4794-4d35-aa8d-76ea301ef397",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# flag rows authored by the Gerrit account (PHID-USER-idceizaw6elwiwm5xshb)\n",
|
||||||
|
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||||||
|
"#cleaning df\n",
|
||||||
|
"phab_df['id'] = phab_df.index + 1\n",
|
||||||
|
"# build the reply_to column: each comment replies to the previous comment in the same task (None for the first)\n",
|
||||||
|
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||||||
|
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||||||
|
"\n",
|
||||||
|
"phab_df = phab_df.rename(columns={\n",
|
||||||
|
" 'date_created': 'timestamp',\n",
|
||||||
|
" 'comment_text': 'text',\n",
|
||||||
|
" 'AuthorPHID': 'speaker',\n",
|
||||||
|
" 'TaskPHID': 'conversation_id',\n",
|
||||||
|
" 'WMFaffil':'meta.affil',\n",
|
||||||
|
" 'isGerrit': 'meta.gerrit'\n",
|
||||||
|
"})\n",
|
||||||
|
"# keep comments strictly after 2012-11-01 and strictly before 2013-11-01 (epoch seconds, 00:00 UTC)\n",
|
||||||
|
"filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"id": "7f1aed21-3dbb-40f4-b275-461c35e5d07c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"10277it [00:00, 30651.56it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 20708 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 28751 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 29804 has been casted to a string.\n",
|
||||||
|
"\u001b[91mWARNING: \u001b[0mUtterance text must be a string: text of utterance with ID: 31861 has been casted to a string.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"phab_corpus = Corpus.from_pandas(filtered_phab_df)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"id": "f0d1f727-c16f-4b61-b83a-e462c15bbcb5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Number of Speakers: 236\n",
|
||||||
|
"Number of Utterances: 10277\n",
|
||||||
|
"Number of Conversations: 2205\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"phab_corpus.print_summary_stats()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "e2a9cb88-876f-416b-ba9e-665170b24aa9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "KeyError",
|
||||||
|
"evalue": "'This BackendMapper does not have an entry for the meta with id utterance_711.'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[0;32mIn[35], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m \u001b[43mCorpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mphab_corpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:647\u001b[0m, in \u001b[0;36mCorpus.filter_utterances\u001b[0;34m(source_corpus, selector)\u001b[0m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfilter_utterances\u001b[39m(source_corpus: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorpus\u001b[39m\u001b[38;5;124m\"\u001b[39m, selector: Callable[[Utterance], \u001b[38;5;28mbool\u001b[39m]):\n\u001b[1;32m 637\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;124;03m Returns a new corpus that includes only a subset of Utterances from the source Corpus. This filtering provides no\u001b[39;00m\n\u001b[1;32m 639\u001b[0m \u001b[38;5;124;03m guarantees with regard to maintaining conversational integrity and should be used with care.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[38;5;124;03m :return: a new Corpus with a subset of the Utterances\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 647\u001b[0m utts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msource_corpus\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_utterances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 648\u001b[0m new_corpus \u001b[38;5;241m=\u001b[39m Corpus(utterances\u001b[38;5;241m=\u001b[39mutts)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m convo \u001b[38;5;129;01min\u001b[39;00m new_corpus\u001b[38;5;241m.\u001b[39miter_conversations():\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/corpus.py:444\u001b[0m, in \u001b[0;36mCorpus.iter_utterances\u001b[0;34m(self, selector)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 437\u001b[0m \u001b[38;5;124;03mGet utterances in the Corpus, with an optional selector that filters for Utterances that should be included.\u001b[39;00m\n\u001b[1;32m 438\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m:return: a generator of Utterances\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mutterances\u001b[38;5;241m.\u001b[39mvalues():\n\u001b[0;32m--> 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mselector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m v\n",
|
||||||
|
"Cell \u001b[0;32mIn[35], line 1\u001b[0m, in \u001b[0;36m<lambda>\u001b[0;34m(utt)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m no_bots_phab_corpus \u001b[38;5;241m=\u001b[39m Corpus\u001b[38;5;241m.\u001b[39mfilter_utterances(phab_corpus, \u001b[38;5;28;01mlambda\u001b[39;00m utt: \u001b[43mutt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmeta\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgerrit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m no_bots_phab_corpus\u001b[38;5;241m.\u001b[39mprint_summary_stats()\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/convoKitMeta.py:37\u001b[0m, in \u001b[0;36mConvoKitMeta.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, item):\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# affect corpus metadata backend, but only acting on the copy of it.\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_backend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m immutable_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mfloat\u001b[39m, 
\u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28mcomplex\u001b[39m, \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mfrozenset\u001b[39m)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, immutable_types):\n",
|
||||||
|
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/convokit/model/backendMapper.py:173\u001b[0m, in \u001b[0;36mMemMapper.get_data\u001b[0;34m(self, component_type, component_id, property_name, index)\u001b[0m\n\u001b[1;32m 171\u001b[0m collection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_collection(component_type)\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m component_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m collection:\n\u001b[0;32m--> 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis BackendMapper does not have an entry for the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcomponent_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 175\u001b[0m )\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m property_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m collection[component_id]\n",
|
||||||
|
"\u001b[0;31mKeyError\u001b[0m: 'This BackendMapper does not have an entry for the meta with id utterance_711.'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)\n",
|
||||||
|
"no_bots_phab_corpus.print_summary_stats()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 49,
|
||||||
|
"id": "dde1dde5-15b5-4291-997f-a0d772e5ecbc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#looking at how language use differs between the two groups \n",
|
||||||
|
"from convokit.text_processing import TextParser\n",
|
||||||
|
"parser = TextParser(input_field='text')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 50,
|
||||||
|
"id": "146d6526-b80f-4981-af2d-277b90852d5f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"050/9245 utterances processed\n",
|
||||||
|
"100/9245 utterances processed\n",
|
||||||
|
"150/9245 utterances processed\n",
|
||||||
|
"200/9245 utterances processed\n",
|
||||||
|
"250/9245 utterances processed\n",
|
||||||
|
"300/9245 utterances processed\n",
|
||||||
|
"350/9245 utterances processed\n",
|
||||||
|
"400/9245 utterances processed\n",
|
||||||
|
"450/9245 utterances processed\n",
|
||||||
|
"500/9245 utterances processed\n",
|
||||||
|
"550/9245 utterances processed\n",
|
||||||
|
"600/9245 utterances processed\n",
|
||||||
|
"650/9245 utterances processed\n",
|
||||||
|
"700/9245 utterances processed\n",
|
||||||
|
"750/9245 utterances processed\n",
|
||||||
|
"800/9245 utterances processed\n",
|
||||||
|
"850/9245 utterances processed\n",
|
||||||
|
"900/9245 utterances processed\n",
|
||||||
|
"950/9245 utterances processed\n",
|
||||||
|
"1000/9245 utterances processed\n",
|
||||||
|
"1050/9245 utterances processed\n",
|
||||||
|
"1100/9245 utterances processed\n",
|
||||||
|
"1150/9245 utterances processed\n",
|
||||||
|
"1200/9245 utterances processed\n",
|
||||||
|
"1250/9245 utterances processed\n",
|
||||||
|
"1300/9245 utterances processed\n",
|
||||||
|
"1350/9245 utterances processed\n",
|
||||||
|
"1400/9245 utterances processed\n",
|
||||||
|
"1450/9245 utterances processed\n",
|
||||||
|
"1500/9245 utterances processed\n",
|
||||||
|
"1550/9245 utterances processed\n",
|
||||||
|
"1600/9245 utterances processed\n",
|
||||||
|
"1650/9245 utterances processed\n",
|
||||||
|
"1700/9245 utterances processed\n",
|
||||||
|
"1750/9245 utterances processed\n",
|
||||||
|
"1800/9245 utterances processed\n",
|
||||||
|
"1850/9245 utterances processed\n",
|
||||||
|
"1900/9245 utterances processed\n",
|
||||||
|
"1950/9245 utterances processed\n",
|
||||||
|
"2000/9245 utterances processed\n",
|
||||||
|
"2050/9245 utterances processed\n",
|
||||||
|
"2100/9245 utterances processed\n",
|
||||||
|
"2150/9245 utterances processed\n",
|
||||||
|
"2200/9245 utterances processed\n",
|
||||||
|
"2250/9245 utterances processed\n",
|
||||||
|
"2300/9245 utterances processed\n",
|
||||||
|
"2350/9245 utterances processed\n",
|
||||||
|
"2400/9245 utterances processed\n",
|
||||||
|
"2450/9245 utterances processed\n",
|
||||||
|
"2500/9245 utterances processed\n",
|
||||||
|
"2550/9245 utterances processed\n",
|
||||||
|
"2600/9245 utterances processed\n",
|
||||||
|
"2650/9245 utterances processed\n",
|
||||||
|
"2700/9245 utterances processed\n",
|
||||||
|
"2750/9245 utterances processed\n",
|
||||||
|
"2800/9245 utterances processed\n",
|
||||||
|
"2850/9245 utterances processed\n",
|
||||||
|
"2900/9245 utterances processed\n",
|
||||||
|
"2950/9245 utterances processed\n",
|
||||||
|
"3000/9245 utterances processed\n",
|
||||||
|
"3050/9245 utterances processed\n",
|
||||||
|
"3100/9245 utterances processed\n",
|
||||||
|
"3150/9245 utterances processed\n",
|
||||||
|
"3200/9245 utterances processed\n",
|
||||||
|
"3250/9245 utterances processed\n",
|
||||||
|
"3300/9245 utterances processed\n",
|
||||||
|
"3350/9245 utterances processed\n",
|
||||||
|
"3400/9245 utterances processed\n",
|
||||||
|
"3450/9245 utterances processed\n",
|
||||||
|
"3500/9245 utterances processed\n",
|
||||||
|
"3550/9245 utterances processed\n",
|
||||||
|
"3600/9245 utterances processed\n",
|
||||||
|
"3650/9245 utterances processed\n",
|
||||||
|
"3700/9245 utterances processed\n",
|
||||||
|
"3750/9245 utterances processed\n",
|
||||||
|
"3800/9245 utterances processed\n",
|
||||||
|
"3850/9245 utterances processed\n",
|
||||||
|
"3900/9245 utterances processed\n",
|
||||||
|
"3950/9245 utterances processed\n",
|
||||||
|
"4000/9245 utterances processed\n",
|
||||||
|
"4050/9245 utterances processed\n",
|
||||||
|
"4100/9245 utterances processed\n",
|
||||||
|
"4150/9245 utterances processed\n",
|
||||||
|
"4200/9245 utterances processed\n",
|
||||||
|
"4250/9245 utterances processed\n",
|
||||||
|
"4300/9245 utterances processed\n",
|
||||||
|
"4350/9245 utterances processed\n",
|
||||||
|
"4400/9245 utterances processed\n",
|
||||||
|
"4450/9245 utterances processed\n",
|
||||||
|
"4500/9245 utterances processed\n",
|
||||||
|
"4550/9245 utterances processed\n",
|
||||||
|
"4600/9245 utterances processed\n",
|
||||||
|
"4650/9245 utterances processed\n",
|
||||||
|
"4700/9245 utterances processed\n",
|
||||||
|
"4750/9245 utterances processed\n",
|
||||||
|
"4800/9245 utterances processed\n",
|
||||||
|
"4850/9245 utterances processed\n",
|
||||||
|
"4900/9245 utterances processed\n",
|
||||||
|
"4950/9245 utterances processed\n",
|
||||||
|
"5000/9245 utterances processed\n",
|
||||||
|
"5050/9245 utterances processed\n",
|
||||||
|
"5100/9245 utterances processed\n",
|
||||||
|
"5150/9245 utterances processed\n",
|
||||||
|
"5200/9245 utterances processed\n",
|
||||||
|
"5250/9245 utterances processed\n",
|
||||||
|
"5300/9245 utterances processed\n",
|
||||||
|
"5350/9245 utterances processed\n",
|
||||||
|
"5400/9245 utterances processed\n",
|
||||||
|
"5450/9245 utterances processed\n",
|
||||||
|
"5500/9245 utterances processed\n",
|
||||||
|
"5550/9245 utterances processed\n",
|
||||||
|
"5600/9245 utterances processed\n",
|
||||||
|
"5650/9245 utterances processed\n",
|
||||||
|
"5700/9245 utterances processed\n",
|
||||||
|
"5750/9245 utterances processed\n",
|
||||||
|
"5800/9245 utterances processed\n",
|
||||||
|
"5850/9245 utterances processed\n",
|
||||||
|
"5900/9245 utterances processed\n",
|
||||||
|
"5950/9245 utterances processed\n",
|
||||||
|
"6000/9245 utterances processed\n",
|
||||||
|
"6050/9245 utterances processed\n",
|
||||||
|
"6100/9245 utterances processed\n",
|
||||||
|
"6150/9245 utterances processed\n",
|
||||||
|
"6200/9245 utterances processed\n",
|
||||||
|
"6250/9245 utterances processed\n",
|
||||||
|
"6300/9245 utterances processed\n",
|
||||||
|
"6350/9245 utterances processed\n",
|
||||||
|
"6400/9245 utterances processed\n",
|
||||||
|
"6450/9245 utterances processed\n",
|
||||||
|
"6500/9245 utterances processed\n",
|
||||||
|
"6550/9245 utterances processed\n",
|
||||||
|
"6600/9245 utterances processed\n",
|
||||||
|
"6650/9245 utterances processed\n",
|
||||||
|
"6700/9245 utterances processed\n",
|
||||||
|
"6750/9245 utterances processed\n",
|
||||||
|
"6800/9245 utterances processed\n",
|
||||||
|
"6850/9245 utterances processed\n",
|
||||||
|
"6900/9245 utterances processed\n",
|
||||||
|
"6950/9245 utterances processed\n",
|
||||||
|
"7000/9245 utterances processed\n",
|
||||||
|
"7050/9245 utterances processed\n",
|
||||||
|
"7100/9245 utterances processed\n",
|
||||||
|
"7150/9245 utterances processed\n",
|
||||||
|
"7200/9245 utterances processed\n",
|
||||||
|
"7250/9245 utterances processed\n",
|
||||||
|
"7300/9245 utterances processed\n",
|
||||||
|
"7350/9245 utterances processed\n",
|
||||||
|
"7400/9245 utterances processed\n",
|
||||||
|
"7450/9245 utterances processed\n",
|
||||||
|
"7500/9245 utterances processed\n",
|
||||||
|
"7550/9245 utterances processed\n",
|
||||||
|
"7600/9245 utterances processed\n",
|
||||||
|
"7650/9245 utterances processed\n",
|
||||||
|
"7700/9245 utterances processed\n",
|
||||||
|
"7750/9245 utterances processed\n",
|
||||||
|
"7800/9245 utterances processed\n",
|
||||||
|
"7850/9245 utterances processed\n",
|
||||||
|
"7900/9245 utterances processed\n",
|
||||||
|
"7950/9245 utterances processed\n",
|
||||||
|
"8000/9245 utterances processed\n",
|
||||||
|
"8050/9245 utterances processed\n",
|
||||||
|
"8100/9245 utterances processed\n",
|
||||||
|
"8150/9245 utterances processed\n",
|
||||||
|
"8200/9245 utterances processed\n",
|
||||||
|
"8250/9245 utterances processed\n",
|
||||||
|
"8300/9245 utterances processed\n",
|
||||||
|
"8350/9245 utterances processed\n",
|
||||||
|
"8400/9245 utterances processed\n",
|
||||||
|
"8450/9245 utterances processed\n",
|
||||||
|
"8500/9245 utterances processed\n",
|
||||||
|
"8550/9245 utterances processed\n",
|
||||||
|
"8600/9245 utterances processed\n",
|
||||||
|
"8650/9245 utterances processed\n",
|
||||||
|
"8700/9245 utterances processed\n",
|
||||||
|
"8750/9245 utterances processed\n",
|
||||||
|
"8800/9245 utterances processed\n",
|
||||||
|
"8850/9245 utterances processed\n",
|
||||||
|
"8900/9245 utterances processed\n",
|
||||||
|
"8950/9245 utterances processed\n",
|
||||||
|
"9000/9245 utterances processed\n",
|
||||||
|
"9050/9245 utterances processed\n",
|
||||||
|
"9100/9245 utterances processed\n",
|
||||||
|
"9150/9245 utterances processed\n",
|
||||||
|
"9200/9245 utterances processed\n",
|
||||||
|
"9245/9245 utterances processed\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.21"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user