1161 lines
200 KiB
Plaintext
1161 lines
200 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "ba9e5acd-e17d-4318-9272-04c9f6706186",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd \n",
|
||
"import spacy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "e4f0b3f0-5255-46f1-822f-e455087ba315",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_rel_phab_comments.csv\"\n",
|
||
"phab_df = pd.read_csv(phab_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "ac5e624b-08a4-4ede-bc96-cfc26c3edac3",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Substrings that mark a word as relevant to the HTTP / login / TLS topic.
_RELEVANT_SUBSTRINGS = ("http", "login", "ssl", "tls")


def http_relevant(text):
    """Return True when ``text`` mentions an HTTP/login/SSL/TLS/certificate term.

    The text is split on whitespace and each word is checked case-insensitively.
    Words containing ``://`` are skipped, so a pasted URL does not by itself make
    the text relevant.

    Parameters
    ----------
    text : str or null
        Text to inspect. Null values (as judged by ``pd.isnull``) yield False.

    Returns
    -------
    bool
        True as soon as one qualifying word is found, otherwise False.
    """
    if pd.isnull(text):
        return False

    for word in text.split():
        lowered = word.lower()  # lower-case once instead of once per check

        # Skip URLs: only prose mentions of the keywords should count.
        if "://" in lowered:
            continue

        if any(keyword in lowered for keyword in _RELEVANT_SUBSTRINGS):
            return True

        # "cert", "certificate", "certs", ... but not "certain" / "certainly".
        if lowered.startswith("cert") and not lowered.startswith("certain"):
            return True

    return False
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "d5925c49-ea1d-4813-98aa-eae10d5879ca",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def is_migrated(comment_text):
    """Tell whether a comment opens with the SourceForge migration banner.

    Null input (per ``pd.isnull``) is reported as not migrated. Otherwise the
    text is stripped of surrounding whitespace and tested for the literal
    prefix ``Originally from: http://sourceforge.net``.
    """
    return (not pd.isnull(comment_text)) and comment_text.strip().startswith(
        "Originally from: http://sourceforge.net"
    )
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "c05f8b0d-ae4c-4cd5-8832-edb54e36ed9a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>task_title</th>\n",
|
||
" <th>comment_text</th>\n",
|
||
" <th>date_created</th>\n",
|
||
" <th>speaker</th>\n",
|
||
" <th>meta.affil</th>\n",
|
||
" <th>conversation_id</th>\n",
|
||
" <th>comment_type</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>meta.gerrit</th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>reply_to</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>is_relevant</th>\n",
|
||
" <th>is_migrated</th>\n",
|
||
" <th>text</th>\n",
|
||
" <th>resolved_text</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>time data error</td>\n",
|
||
" <td>After last update via SVN bot does not work, s...</td>\n",
|
||
" <td>1381482240</td>\n",
|
||
" <td>PHID-USER-wwnv7nzuscfuc2xfjwbq</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-qjt5coghg7n62wamkubq</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>115</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2013-10-11 09:04:00+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>After last update via SVN bot does not work, s...</td>\n",
|
||
" <td>After last update via SVN bot does not work, s...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>time data error</td>\n",
|
||
" <td>SVN r10320 is https://gerrit.wikimedia.org/r/8...</td>\n",
|
||
" <td>1381484030</td>\n",
|
||
" <td>PHID-USER-xezsyhikbr7hjrig2ofp</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-qjt5coghg7n62wamkubq</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>118</td>\n",
|
||
" <td>117.0</td>\n",
|
||
" <td>2013-10-11 09:33:50+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>SVN r10320 is https://gerrit.wikimedia.org/r/8...</td>\n",
|
||
" <td>SVN r10320 is https://gerrit.wikimedia.org/r/8...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>time data error</td>\n",
|
||
" <td>see also bug 55399</td>\n",
|
||
" <td>1381483747</td>\n",
|
||
" <td>PHID-USER-xezsyhikbr7hjrig2ofp</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-qjt5coghg7n62wamkubq</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>119</td>\n",
|
||
" <td>118.0</td>\n",
|
||
" <td>2013-10-11 09:29:07+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>see also bug 55399</td>\n",
|
||
" <td>see also bug 55399</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>time data error</td>\n",
|
||
" <td>It's a mess with these timestamps. Without tha...</td>\n",
|
||
" <td>1381483651</td>\n",
|
||
" <td>PHID-USER-xezsyhikbr7hjrig2ofp</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-qjt5coghg7n62wamkubq</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>120</td>\n",
|
||
" <td>119.0</td>\n",
|
||
" <td>2013-10-11 09:27:31+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>It's a mess with these timestamps. Without tha...</td>\n",
|
||
" <td>It's a mess with these timestamps. Without tha...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>time data error</td>\n",
|
||
" <td>When I go back from SVN revision 10320 to 1031...</td>\n",
|
||
" <td>1381482504</td>\n",
|
||
" <td>PHID-USER-wwnv7nzuscfuc2xfjwbq</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-qjt5coghg7n62wamkubq</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>121</td>\n",
|
||
" <td>120.0</td>\n",
|
||
" <td>2013-10-11 09:08:24+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>When I go back from SVN revision 10320 to 1031...</td>\n",
|
||
" <td>When I go back from SVN revision 10320 to 1031...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6510</th>\n",
|
||
" <td>VisualEditor: Automatic naming scheme for ref...</td>\n",
|
||
" <td>Intention:\\nRe-use a reference.\\n\\n\\nActual Re...</td>\n",
|
||
" <td>1385163660</td>\n",
|
||
" <td>PHID-USER-uf3buojo4ceizjywvyn5</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-j3rfh4pmjx4pel7dk2tn</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>duplicate</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>155659</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2013-11-22 23:41:00+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>Intention:\\nRe-use a reference.\\n\\n\\nActual Re...</td>\n",
|
||
" <td>Intention:\\nRe-use a reference.\\n\\n\\nActual Re...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6511</th>\n",
|
||
" <td>VisualEditor: Automatic naming scheme for ref...</td>\n",
|
||
" <td>Speaking as an extensive editor, I just find t...</td>\n",
|
||
" <td>1385399054</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-j3rfh4pmjx4pel7dk2tn</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>155661</td>\n",
|
||
" <td>155660.0</td>\n",
|
||
" <td>2013-11-25 17:04:14+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>Speaking as an extensive editor, I just find t...</td>\n",
|
||
" <td>Speaking as an extensive editor, I just find t...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6512</th>\n",
|
||
" <td>VisualEditor: Automatic naming scheme for ref...</td>\n",
|
||
" <td>I realize that any automagic system will have ...</td>\n",
|
||
" <td>1385397795</td>\n",
|
||
" <td>PHID-USER-uf3buojo4ceizjywvyn5</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-j3rfh4pmjx4pel7dk2tn</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>155662</td>\n",
|
||
" <td>155661.0</td>\n",
|
||
" <td>2013-11-25 16:43:15+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>I realize that any automagic system will have ...</td>\n",
|
||
" <td>I realize that any automagic system will have ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6513</th>\n",
|
||
" <td>VisualEditor: Automatic naming scheme for ref...</td>\n",
|
||
" <td>Why humans need to be able to remember the ref...</td>\n",
|
||
" <td>1385397298</td>\n",
|
||
" <td>PHID-USER-uf3buojo4ceizjywvyn5</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-j3rfh4pmjx4pel7dk2tn</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>155663</td>\n",
|
||
" <td>155662.0</td>\n",
|
||
" <td>2013-11-25 16:34:58+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>Why humans need to be able to remember the ref...</td>\n",
|
||
" <td>Why humans need to be able to remember the ref...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6514</th>\n",
|
||
" <td>VisualEditor: Automatic naming scheme for ref...</td>\n",
|
||
" <td>(In reply to comment #0)\\n> The ref naming sch...</td>\n",
|
||
" <td>1385394470</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-j3rfh4pmjx4pel7dk2tn</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>155664</td>\n",
|
||
" <td>155663.0</td>\n",
|
||
" <td>2013-11-25 15:47:50+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>(In reply to comment #0)\\n> The ref naming sch...</td>\n",
|
||
" <td>(In reply to comment #0)\\n> The ref naming sch...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6515 rows × 16 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" task_title \\\n",
|
||
"0 time data error \n",
|
||
"1 time data error \n",
|
||
"2 time data error \n",
|
||
"3 time data error \n",
|
||
"4 time data error \n",
|
||
"... ... \n",
|
||
"6510 VisualEditor: Automatic naming scheme for ref... \n",
|
||
"6511 VisualEditor: Automatic naming scheme for ref... \n",
|
||
"6512 VisualEditor: Automatic naming scheme for ref... \n",
|
||
"6513 VisualEditor: Automatic naming scheme for ref... \n",
|
||
"6514 VisualEditor: Automatic naming scheme for ref... \n",
|
||
"\n",
|
||
" comment_text date_created \\\n",
|
||
"0 After last update via SVN bot does not work, s... 1381482240 \n",
|
||
"1 SVN r10320 is https://gerrit.wikimedia.org/r/8... 1381484030 \n",
|
||
"2 see also bug 55399 1381483747 \n",
|
||
"3 It's a mess with these timestamps. Without tha... 1381483651 \n",
|
||
"4 When I go back from SVN revision 10320 to 1031... 1381482504 \n",
|
||
"... ... ... \n",
|
||
"6510 Intention:\\nRe-use a reference.\\n\\n\\nActual Re... 1385163660 \n",
|
||
"6511 Speaking as an extensive editor, I just find t... 1385399054 \n",
|
||
"6512 I realize that any automagic system will have ... 1385397795 \n",
|
||
"6513 Why humans need to be able to remember the ref... 1385397298 \n",
|
||
"6514 (In reply to comment #0)\\n> The ref naming sch... 1385394470 \n",
|
||
"\n",
|
||
" speaker meta.affil \\\n",
|
||
"0 PHID-USER-wwnv7nzuscfuc2xfjwbq False \n",
|
||
"1 PHID-USER-xezsyhikbr7hjrig2ofp False \n",
|
||
"2 PHID-USER-xezsyhikbr7hjrig2ofp False \n",
|
||
"3 PHID-USER-xezsyhikbr7hjrig2ofp False \n",
|
||
"4 PHID-USER-wwnv7nzuscfuc2xfjwbq False \n",
|
||
"... ... ... \n",
|
||
"6510 PHID-USER-uf3buojo4ceizjywvyn5 True \n",
|
||
"6511 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"6512 PHID-USER-uf3buojo4ceizjywvyn5 True \n",
|
||
"6513 PHID-USER-uf3buojo4ceizjywvyn5 True \n",
|
||
"6514 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"\n",
|
||
" conversation_id comment_type status \\\n",
|
||
"0 PHID-TASK-qjt5coghg7n62wamkubq task_description resolved \n",
|
||
"1 PHID-TASK-qjt5coghg7n62wamkubq task_subcomment NaN \n",
|
||
"2 PHID-TASK-qjt5coghg7n62wamkubq task_subcomment NaN \n",
|
||
"3 PHID-TASK-qjt5coghg7n62wamkubq task_subcomment NaN \n",
|
||
"4 PHID-TASK-qjt5coghg7n62wamkubq task_subcomment NaN \n",
|
||
"... ... ... ... \n",
|
||
"6510 PHID-TASK-j3rfh4pmjx4pel7dk2tn task_description duplicate \n",
|
||
"6511 PHID-TASK-j3rfh4pmjx4pel7dk2tn task_subcomment NaN \n",
|
||
"6512 PHID-TASK-j3rfh4pmjx4pel7dk2tn task_subcomment NaN \n",
|
||
"6513 PHID-TASK-j3rfh4pmjx4pel7dk2tn task_subcomment NaN \n",
|
||
"6514 PHID-TASK-j3rfh4pmjx4pel7dk2tn task_subcomment NaN \n",
|
||
"\n",
|
||
" meta.gerrit id reply_to timestamp is_relevant \\\n",
|
||
"0 False 115 NaN 2013-10-11 09:04:00+00:00 True \n",
|
||
"1 False 118 117.0 2013-10-11 09:33:50+00:00 True \n",
|
||
"2 False 119 118.0 2013-10-11 09:29:07+00:00 True \n",
|
||
"3 False 120 119.0 2013-10-11 09:27:31+00:00 True \n",
|
||
"4 False 121 120.0 2013-10-11 09:08:24+00:00 True \n",
|
||
"... ... ... ... ... ... \n",
|
||
"6510 False 155659 NaN 2013-11-22 23:41:00+00:00 True \n",
|
||
"6511 False 155661 155660.0 2013-11-25 17:04:14+00:00 True \n",
|
||
"6512 False 155662 155661.0 2013-11-25 16:43:15+00:00 True \n",
|
||
"6513 False 155663 155662.0 2013-11-25 16:34:58+00:00 True \n",
|
||
"6514 False 155664 155663.0 2013-11-25 15:47:50+00:00 True \n",
|
||
"\n",
|
||
" is_migrated text \\\n",
|
||
"0 False After last update via SVN bot does not work, s... \n",
|
||
"1 False SVN r10320 is https://gerrit.wikimedia.org/r/8... \n",
|
||
"2 False see also bug 55399 \n",
|
||
"3 False It's a mess with these timestamps. Without tha... \n",
|
||
"4 False When I go back from SVN revision 10320 to 1031... \n",
|
||
"... ... ... \n",
|
||
"6510 False Intention:\\nRe-use a reference.\\n\\n\\nActual Re... \n",
|
||
"6511 False Speaking as an extensive editor, I just find t... \n",
|
||
"6512 False I realize that any automagic system will have ... \n",
|
||
"6513 False Why humans need to be able to remember the ref... \n",
|
||
"6514 False (In reply to comment #0)\\n> The ref naming sch... \n",
|
||
"\n",
|
||
" resolved_text \n",
|
||
"0 After last update via SVN bot does not work, s... \n",
|
||
"1 SVN r10320 is https://gerrit.wikimedia.org/r/8... \n",
|
||
"2 see also bug 55399 \n",
|
||
"3 It's a mess with these timestamps. Without tha... \n",
|
||
"4 When I go back from SVN revision 10320 to 1031... \n",
|
||
"... ... \n",
|
||
"6510 Intention:\\nRe-use a reference.\\n\\n\\nActual Re... \n",
|
||
"6511 Speaking as an extensive editor, I just find t... \n",
|
||
"6512 I realize that any automagic system will have ... \n",
|
||
"6513 Why humans need to be able to remember the ref... \n",
|
||
"6514 (In reply to comment #0)\\n> The ref naming sch... \n",
|
||
"\n",
|
||
"[6515 rows x 16 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"phab_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "d449164e-1d28-4580-9eb1-f0f69978f114",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# --- Clean the Phabricator comment frame -------------------------------------

# Flag comments written by the Gerrit bot account (identified by its PHID).
GERRIT_BOT_PHID = 'PHID-USER-idceizaw6elwiwm5xshb'
phab_df['isGerrit'] = phab_df['speaker'] == GERRIT_BOT_PHID

# Parse the epoch-seconds creation time into a timezone-aware (UTC) timestamp.
# NOTE: the date-window filter (after 12-1-2012, before 12-1-2013) is currently disabled:
#   phab_df[(phab_df['date_created'] < 1385596799) & (phab_df['date_created'] > 1315008000)]
phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)

# Remove headless conversations: keep only comments whose conversation has a
# task_description row in this frame.
# .copy() makes these independent frames; later cells add columns to them, and
# writing into a boolean-mask slice of phab_df triggers SettingWithCopyWarning.
task_phab_df = phab_df[phab_df['comment_type'] == "task_description"].copy()
headed_task_phids = task_phab_df['conversation_id'].unique()
filtered_phab_df = phab_df[phab_df['conversation_id'].isin(headed_task_phids)].copy()

# NOTE(review): the Gerrit-comment removal and the http_relevant / is_migrated
# conversation filters are disabled here. The frame displayed above already has
# `is_relevant` and `is_migrated` columns, so they appear to have been computed
# before this CSV was written — confirm before re-enabling.
comment_phab_df = filtered_phab_df
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Unique conversation_ids: 1074\n",
|
||
"Unique ids: 6515\n",
|
||
"Unique speakers: 305\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"unique_conversation_ids = len(comment_phab_df['conversation_id'].unique())\n",
|
||
"unique_ids = len(comment_phab_df['id'].unique())\n",
|
||
"unique_speakers = len(comment_phab_df['speaker'].unique())\n",
|
||
"\n",
|
||
"print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n",
|
||
"print(f\"Unique ids: {unique_ids}\")\n",
|
||
"print(f\"Unique speakers: {unique_speakers}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "d226d781-b002-4842-a3ae-92d4851a5878",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import re\n",
|
||
"\n",
|
# Matches real URLs only (scheme + "://"), so bare words such as "https" or
# "httpd" survive preprocessing and can still be found in the parsed tokens.
_URL_RE = re.compile(r'https?://\S+')


def preprocess_text(text):
    """Normalise a comment for parsing: drop URLs, then blank out ``*`` and ``-``.

    Parameters
    ----------
    text : object
        Raw comment text. Null values (per ``pd.isnull``) become an empty
        string instead of the literal word "nan"; other non-string values are
        converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    if pd.isnull(text):
        return ''

    cleaned = str(text)
    # Strip URLs *before* touching '-' and '*': replacing them first splits a
    # URL such as "https://my-site.org" at the hyphen and leaves " site.org".
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = cleaned.replace('*', ' ').replace('-', ' ')
    return cleaned
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df['processed_text'] = comment_phab_df['comment_text'].apply(preprocess_text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "b8eddf40-1fe2-4fce-be74-b32552b40c57",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
||
"\n",
|
def extract_dependency_tree(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline and flatten it.

    Returns one 7-tuple per token, in sentence order:
    ``(text, lemma, dependency label, head text, ancestors, subtree, children)``,
    where the last three entries are lists built from the token's
    ``ancestors``, ``subtree`` and ``children`` iterators.
    """
    parsed = nlp(text)
    return [
        (
            tok.text,
            tok.lemma_,
            tok.dep_,
            tok.head.text,
            list(tok.ancestors),
            list(tok.subtree),
            list(tok.children),
        )
        for sent in parsed.sents
        for tok in sent
    ]
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "337a528a-5667-4e1f-ac9a-37caabc03a18",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "1b51f395-aaa9-4bf2-9c67-c1bc4640a89a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_resolved_dep_trees.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#get VAD scores\n",
|
||
"import numpy as np\n",
|
||
"#https://saifmohammad.com/WebPages/nrc-vad.html\n",
|
||
"column_headings = ['Word', 'Valence', 'Arousal', 'Domination']\n",
|
||
"vad_lexicon = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\\t', header=None, names=column_headings)\n",
|
||
"vad_dict = vad_lexicon.set_index('Word').T.to_dict()\n",
|
||
"\n",
|
def vad_scoring(dependency_tree):
    """Average the lexicon scores of a comment's tokens.

    Only tokens whose lemma is a key of the module-level ``vad_dict`` are used.
    Returns ``[valence, arousal, dominance]`` means; a dimension with no
    matching tokens is reported as 0.
    """
    # Look up every token whose lemma is present in the lexicon.
    matched = [
        vad_dict[lemma]
        for _token, lemma, _dep, _head, _ancestors, _subtree, _children in dependency_tree
        if lemma in vad_dict
    ]

    averages = []
    for dimension in ('Valence', 'Arousal', 'Domination'):
        values = [scores[dimension] for scores in matched]
        # Compute the average score across the comment for this dimension.
        averages.append(np.mean(values) if values else 0)
    return averages
|
||
"\n",
|
def _count_extreme_words(dependency_tree, dimension, high=0.75, low=0.25):
    """Count tokens whose lexicon score on ``dimension`` is at either extreme.

    A token is counted when its lemma is a key of the module-level ``vad_dict``
    and its score for ``dimension`` is ``>= high`` or ``<= low``.

    Parameters
    ----------
    dependency_tree : iterable of 7-tuples
        ``(token, lemma, dep, head, ancestors, subtree, children)`` entries.
    dimension : str
        Key into each ``vad_dict`` entry: 'Valence', 'Arousal' or 'Domination'.
    high, low : float
        Inclusive thresholds for the high and low extremes.

    Returns
    -------
    int
        Number of tokens at either extreme.
    """
    count = 0
    for _token, lemma, _dep, _head, _ancestors, _subtree, _children in dependency_tree:
        if lemma in vad_dict:
            score = vad_dict[lemma][dimension]
            if score >= high or score <= low:
                count += 1
    return count


def dominance_prevail(dependency_tree):
    """Count tokens with a dominance score >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Domination')


def arousal_prevail(dependency_tree):
    """Count tokens with an arousal score >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Arousal')


def valence_prevail(dependency_tree):
    """Count tokens with a valence score >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Valence')
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "828fb57a-e152-42ef-9c60-660648898532",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#establishing per-comment VAD scores \n",
|
||
"comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n",
|
||
"comment_phab_df['dominant_wc'] = comment_phab_df['dependency_tree'].apply(dominance_prevail)\n",
|
||
"comment_phab_df['arousal_wc'] = comment_phab_df['dependency_tree'].apply(arousal_prevail)\n",
|
||
"comment_phab_df['valence_wc'] = comment_phab_df['dependency_tree'].apply(valence_prevail)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "27e47f6f-0257-4b70-b222-e91ef888c900",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n",
|
||
"comment_phab_df = comment_phab_df.drop(columns=['avg_vad_scores'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "09ddcbfc-b856-40ca-ad61-13577795d94b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import datetime"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "184ccbe6-0a7a-41b8-9b02-bc439ff975d0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# expand the dependency parser

# Alternative term patterns used for other analyses:
#pattern = r'\b(ve|VE|visualeditor|VisualEditor)\b'
#pattern = r'\b(WMF|Foundation)\b'
#pattern = r'\b(bots|scripts|gadgets)\b'
pattern = r'\b(http|https)\b'


def _flatten_dependency_trees(frame, tree_column):
    """Turn a column of per-comment dependency trees into one record per token.

    Parameters
    ----------
    frame : pd.DataFrame
        Must provide 'id', 'timestamp', 'meta.affil' and ``tree_column``.
    tree_column : str
        Column holding lists of
        ``(token, lemma, dep, head, ancestors, subtree, children)`` tuples.

    Returns
    -------
    list of dict
        One dict per token with the comment metadata, the token, its
        dependency label and head, and the sizes of its ancestor and child
        lists ('depth' and 'children').
    """
    records = []
    for _, row in frame.iterrows():
        comment_id = row['id']
        timestamp = row['timestamp']
        wmf_affil = row['meta.affil']
        for token, _lemma, dep, head, ancestors, _subtree, children in row[tree_column]:
            records.append({
                'comment_id': comment_id,
                'timestamp': timestamp,
                'wmfAffil': wmf_affil,
                'token': token,
                'dependency': dep,
                'head': head,
                'depth': len(ancestors),
                'children': len(children),
            })
    return records


dependency_relations = _flatten_dependency_trees(comment_phab_df, 'dependency_tree')
resolved_dependency_relations = _flatten_dependency_trees(comment_phab_df, 'resolved_dependency_tree')

resolved_dependency_relations_df = pd.DataFrame(resolved_dependency_relations)
dependency_relations_df = pd.DataFrame(dependency_relations)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "82498686-14f4-40c8-9e33-27b31f115b47",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#now analysis/plotting \n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from matplotlib.gridspec import GridSpec"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "5a91a59a-0d1c-48b3-93dd-b9df76ca68e5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<seaborn.axisgrid.FacetGrid at 0x14ca72b957f0>"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1333.5x500 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"plot2 = sns.lmplot(data=affective_comment_phab_df, x=\"speakers_comment\", y=\"polarized_wc\", hue=\"date_group\", col=\"meta.affil\", scatter=False, legend=False, palette=palette)\n",
|
||
"plot2.set_axis_labels(\"Index of Speaker's Comment\", \"Count of Polarized Words\")\n",
|
||
"plot2.set_titles(col_template=\"WMF Affiliation: {col_name}\")\n",
|
||
"plot2.fig.subplots_adjust(top=0.9) # Adjust subplots to make room for the title\n",
|
||
"plot2.add_legend(title=\"Comment publication timestamp:\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "2274795e-c64d-43e4-b0f5-a19b5b8ba2c8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>comment_id</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>wmfAffil</th>\n",
|
||
" <th>token</th>\n",
|
||
" <th>dependency</th>\n",
|
||
" <th>head</th>\n",
|
||
" <th>depth</th>\n",
|
||
" <th>children</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>115</td>\n",
|
||
" <td>2013-10-11 09:04:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>use_api_login</td>\n",
|
||
" <td>dobj</td>\n",
|
||
" <td>use_api</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>157</td>\n",
|
||
" <td>2013-10-07 08:09:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>use_api_login</td>\n",
|
||
" <td>dobj</td>\n",
|
||
" <td>use_api</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>177</td>\n",
|
||
" <td>2013-10-04 17:56:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>certainly</td>\n",
|
||
" <td>advmod</td>\n",
|
||
" <td>require</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>247</td>\n",
|
||
" <td>2013-09-27 22:15:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>Login</td>\n",
|
||
" <td>ROOT</td>\n",
|
||
" <td>Login</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>426</td>\n",
|
||
" <td>2013-09-01 11:26:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>HTTP</td>\n",
|
||
" <td>compound</td>\n",
|
||
" <td>login</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1463</th>\n",
|
||
" <td>45300</td>\n",
|
||
" <td>2013-08-01 17:35:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>certain</td>\n",
|
||
" <td>amod</td>\n",
|
||
" <td>commands</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1464</th>\n",
|
||
" <td>45300</td>\n",
|
||
" <td>2013-08-01 17:35:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>certain</td>\n",
|
||
" <td>amod</td>\n",
|
||
" <td>commands</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1465</th>\n",
|
||
" <td>45373</td>\n",
|
||
" <td>2013-07-27 13:30:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>certain</td>\n",
|
||
" <td>amod</td>\n",
|
||
" <td>element</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1466</th>\n",
|
||
" <td>46078</td>\n",
|
||
" <td>2013-06-18 21:17:00+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>HTTP</td>\n",
|
||
" <td>compound</td>\n",
|
||
" <td>Error</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1467</th>\n",
|
||
" <td>46086</td>\n",
|
||
" <td>2013-06-19 23:31:02+00:00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>HTTP</td>\n",
|
||
" <td>compound</td>\n",
|
||
" <td>Error</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1468 rows × 8 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" comment_id timestamp wmfAffil token \\\n",
|
||
"0 115 2013-10-11 09:04:00+00:00 False use_api_login \n",
|
||
"1 157 2013-10-07 08:09:00+00:00 False use_api_login \n",
|
||
"2 177 2013-10-04 17:56:00+00:00 False certainly \n",
|
||
"3 247 2013-09-27 22:15:00+00:00 False Login \n",
|
||
"4 426 2013-09-01 11:26:00+00:00 False HTTP \n",
|
||
"... ... ... ... ... \n",
|
||
"1463 45300 2013-08-01 17:35:00+00:00 False certain \n",
|
||
"1464 45300 2013-08-01 17:35:00+00:00 False certain \n",
|
||
"1465 45373 2013-07-27 13:30:00+00:00 False certain \n",
|
||
"1466 46078 2013-06-18 21:17:00+00:00 False HTTP \n",
|
||
"1467 46086 2013-06-19 23:31:02+00:00 False HTTP \n",
|
||
"\n",
|
||
" dependency head depth children \n",
|
||
"0 dobj use_api 1 6 \n",
|
||
"1 dobj use_api 1 4 \n",
|
||
"2 advmod require 2 1 \n",
|
||
"3 ROOT Login 0 4 \n",
|
||
"4 compound login 4 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"1463 amod commands 5 0 \n",
|
||
"1464 amod commands 5 0 \n",
|
||
"1465 amod element 8 0 \n",
|
||
"1466 compound Error 2 0 \n",
|
||
"1467 compound Error 3 0 \n",
|
||
"\n",
|
||
"[1468 rows x 8 columns]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"resolved_dependency_relations_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "d2d67d38-f005-4c94-be3c-39eb6b22686f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_44915/3534785199.py:8: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
|
||
" filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True)]\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:9: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
|
||
" resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True)]\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:24: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
||
" filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:24: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:45: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" resolved_filtered_dependencies['timestamp'] = pd.to_datetime(resolved_filtered_dependencies['timestamp'], utc=True)\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:46: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
||
" resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_44915/3534785199.py:46: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x800 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Token pattern to plot; the alternatives are kept commented out for quick switching.\n",
|
||
"# Groups are non-capturing so Series.str.contains does not warn about match groups.\n",
|
||
"#pattern = r'\\b(?:ve|VE|visualeditor|VisualEditor)\\b'\n",
|
||
"#pattern = r'\\b(?:contributor|community|volunteer)\\b'\n",
|
||
"#pattern = r'\\b(?:WMF|Foundation|Wikimedia)\\b'\n",
|
||
"pattern = r'\\b(?:bots|scripts|gadgets)\\b'\n",
|
||
"#pattern = r'\\b(?:http|https)\\b'\n",
|
||
"#pattern = r'\\b(?:auth)\\b'\n",
|
||
"\n",
|
||
"\n",
|
||
"def select_weekly_matches(relations_df, token_pattern):\n",
|
||
"    \"\"\"Return a copy of the rows whose token matches token_pattern, with a 'week' column.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        relations_df: DataFrame with 'token' and 'timestamp' columns.\n",
|
||
"        token_pattern: regular expression searched for within each token.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        A new DataFrame (the input is left untouched) in which 'timestamp' is a\n",
|
||
"        UTC datetime and 'week' is the start time of its pandas 'W' period.\n",
|
||
"    \"\"\"\n",
|
||
"    # na=False treats missing tokens as non-matches so the mask stays boolean.\n",
|
||
"    is_match = relations_df['token'].str.contains(token_pattern, regex=True, na=False)\n",
|
||
"    # .copy() gives an independent frame, so the assignments below do not raise\n",
|
||
"    # SettingWithCopyWarning or depend on view-versus-copy behaviour.\n",
|
||
"    matches = relations_df[is_match].copy()\n",
|
||
"    # Applied to both frames so the two panels are bucketed on the same time basis.\n",
|
||
"    matches['timestamp'] = pd.to_datetime(matches['timestamp'], utc=True)\n",
|
||
"    # Periods carry no timezone; drop it explicitly instead of letting to_period warn.\n",
|
||
"    naive_timestamps = matches['timestamp'].dt.tz_localize(None)\n",
|
||
"    matches['week'] = naive_timestamps.dt.to_period('W').dt.start_time\n",
|
||
"    return matches\n",
|
||
"\n",
|
||
"\n",
|
||
"def plot_depth_panel(ax, wmf_rows, other_rows, title):\n",
|
||
"    \"\"\"Scatter token depth against week on ax, one colour per author group.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        ax: matplotlib Axes to draw on.\n",
|
||
"        wmf_rows: rows to plot as WMF-affiliated authors ('week' and 'depth' columns).\n",
|
||
"        other_rows: rows to plot as nonaffiliated authors ('week' and 'depth' columns).\n",
|
||
"        title: title of the panel.\n",
|
||
"    \"\"\"\n",
|
||
"    sns.scatterplot(data=wmf_rows, x='week', y='depth', ax=ax, color='#c7756a', label='WMF-affiliated authors', marker='o')\n",
|
||
"    sns.scatterplot(data=other_rows, x='week', y='depth', ax=ax, color='#5da2d8', label='Nonaffiliated authors', marker='o')\n",
|
||
"    ax.set_title(title)\n",
|
||
"    # Every point is a single token occurrence, not a weekly median.\n",
|
||
"    ax.set_ylabel('Depth')\n",
|
||
"    ax.set_xlabel('')\n",
|
||
"\n",
|
||
"\n",
|
||
"filtered_dependencies = select_weekly_matches(dependency_relations_df, pattern)\n",
|
||
"resolved_filtered_dependencies = select_weekly_matches(resolved_dependency_relations_df, pattern)\n",
|
||
"\n",
|
||
"# Weekly medians are computed for reference only; they are not drawn in the figure below.\n",
|
||
"median_depth = filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"resolved_median_depth = resolved_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"# '!= True' keeps rows with a missing wmfAffil value in the nonaffiliated group.\n",
|
||
"wmf_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n",
|
||
"other_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] != True]\n",
|
||
"resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] == True]\n",
|
||
"resolved_other_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] != True]\n",
|
||
"\n",
|
||
"plt.figure(figsize=(12, 8))\n",
|
||
"gs = GridSpec(2, 1, height_ratios=[6, 6])\n",
|
||
"\n",
|
||
"# Top panel: original comment text.\n",
|
||
"ax0 = plt.subplot(gs[0])\n",
|
||
"plot_depth_panel(ax0, wmf_filtered_dependencies, other_filtered_dependencies, f'Depth of {pattern} in Phabricator Sentence Dependency Trees')\n",
|
||
"\n",
|
||
"# Bottom panel: coreference-resolved comment text.\n",
|
||
"ax1 = plt.subplot(gs[1])\n",
|
||
"plot_depth_panel(ax1, resolved_wmf_filtered_dependencies, resolved_other_filtered_dependencies, f'Depth of {pattern} in Coreference-resolved Phabricator Sentence Dependency Trees')\n",
|
||
"\n",
|
||
"plt.tight_layout()\n",
|
||
"\n",
|
||
"# Uncomment to write the figure to disk.\n",
|
||
"#plt.savefig('031625_VE_depth_fig.png')"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|