1404 lines
291 KiB
Plaintext
1404 lines
291 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "ba9e5acd-e17d-4318-9272-04c9f6706186",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd \n",
|
||
"import spacy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "e4f0b3f0-5255-46f1-822f-e455087ba315",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Phabricator comment export read by this notebook (absolute path).\n",
|
||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0312_resolved_ve_phab_comments.csv\"\n",
|
||
"phab_df = pd.read_csv(filepath_or_buffer=phab_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "d449164e-1d28-4580-9eb1-f0f69978f114",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Phabricator PHID treated as the Gerrit account (value taken from the original note).\n",
|
||
"GERRIT_PHID = 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||
"# Study window in Unix seconds (UTC): after 2012-11-01, before 2013-11-01 (both bounds exclusive).\n",
|
||
"WINDOW_START = 1351728000\n",
|
||
"WINDOW_END = 1383264000\n",
|
||
"\n",
|
||
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == GERRIT_PHID\n",
|
||
"#cleaning df: 1-based row id; reply_to is the id of the previous row of the same task\n",
|
||
"phab_df['id'] = phab_df.index + 1\n",
|
||
"#may have to build out the reply_to column \n",
|
||
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||
"# NOTE(review): reply_to is assigned before any rows are filtered out below, so it can name an id that is later removed - confirm this is intended.\n",
|
||
"# NOTE(review): the frame displayed further down still shows NaN in reply_to, so on a float column this does not seem to yield None - confirm what downstream code expects.\n",
|
||
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||
"\n",
|
||
"phab_df = phab_df.rename(columns={\n",
|
||
"    'AuthorPHID': 'speaker',\n",
|
||
"    'TaskPHID': 'conversation_id',\n",
|
||
"    'WMFaffil':'meta.affil',\n",
|
||
"    'isGerrit': 'meta.gerrit'\n",
|
||
"})\n",
|
||
"\n",
|
||
"# keep only rows created strictly inside the study window\n",
|
||
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
|
||
"in_window = (phab_df['date_created'] < WINDOW_END) & (phab_df['date_created'] > WINDOW_START)\n",
|
||
"filtered_phab_df = phab_df[in_window]\n",
|
||
"\n",
|
||
"#removing headless conversations: keep tasks whose task_description row survived the window filter\n",
|
||
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
|
||
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
|
||
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
|
||
"\n",
|
||
"#removing gerrit comments \n",
|
||
"# .copy() makes comment_phab_df an independent frame, so the column assignments made on it\n",
|
||
"# in later cells no longer raise SettingWithCopyWarning.\n",
|
||
"comment_phab_df = filtered_phab_df[~filtered_phab_df['meta.gerrit']].copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Unique conversation_ids: 2081\n",
|
||
"Unique ids: 8804\n",
|
||
"Unique speakers: 230\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Cardinality summary of the filtered comment table; dropna=False keeps NaN counted as a value.\n",
|
||
"unique_conversation_ids = comment_phab_df['conversation_id'].nunique(dropna=False)\n",
|
||
"unique_ids = comment_phab_df['id'].nunique(dropna=False)\n",
|
||
"unique_speakers = comment_phab_df['speaker'].nunique(dropna=False)\n",
|
||
"\n",
|
||
"print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n",
|
||
"print(f\"Unique ids: {unique_ids}\")\n",
|
||
"print(f\"Unique speakers: {unique_speakers}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "c0aade6b-f425-4f9b-ae2a-721ea49712ee",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>task_title</th>\n",
|
||
" <th>comment_text</th>\n",
|
||
" <th>date_created</th>\n",
|
||
" <th>speaker</th>\n",
|
||
" <th>meta.affil</th>\n",
|
||
" <th>conversation_id</th>\n",
|
||
" <th>comment_type</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>text</th>\n",
|
||
" <th>resolved_text</th>\n",
|
||
" <th>meta.gerrit</th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>reply_to</th>\n",
|
||
" <th>first_comment</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>708</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>1380976920</td>\n",
|
||
" <td>PHID-USER-wil4b5lylrvf3krixlkl</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>709</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-05 12:42:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>709</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>1381281033</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>710</td>\n",
|
||
" <td>709.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-09 01:10:33+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>712</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>1381267451</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>713</td>\n",
|
||
" <td>712.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-08 21:24:11+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>717</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>1381168024</td>\n",
|
||
" <td>PHID-USER-sai77mtxmpqnm6pycyvz</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>718</td>\n",
|
||
" <td>717.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-07 17:47:04+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>718</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Krinkle, do I need to file a different bug for...</td>\n",
|
||
" <td>1381142922</td>\n",
|
||
" <td>PHID-USER-wil4b5lylrvf3krixlkl</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Krinkle, do I need to file a different bug for...</td>\n",
|
||
" <td>Krinkle, do Krinkle need to file a different b...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>719</td>\n",
|
||
" <td>718.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-07 10:48:42+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32172</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>1354738560</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>32173</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-12-05 20:16:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32178</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>1360206228</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>32179</td>\n",
|
||
" <td>32178.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-02-07 03:03:48+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32179</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>1354926921</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>32180</td>\n",
|
||
" <td>32179.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-12-08 00:35:21+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32180</th>\n",
|
||
" <td>VisualEditor: Two replacements within the same...</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>1353134520</td>\n",
|
||
" <td>PHID-USER-fovtl67ew4l4cc3oeypc</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-guukovmsjsnlpphgujcv</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>invalid</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>32181</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-11-17 06:42:00+00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32181</th>\n",
|
||
" <td>VisualEditor: Two replacements within the same...</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>1360975473</td>\n",
|
||
" <td>PHID-USER-it53o2f2kyryqyj33uzt</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-guukovmsjsnlpphgujcv</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>32182</td>\n",
|
||
" <td>32181.0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-02-16 00:44:33+00:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8804 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" task_title \\\n",
|
||
"708 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"709 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"712 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"717 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"718 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"... ... \n",
|
||
"32172 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32178 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32179 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32180 VisualEditor: Two replacements within the same... \n",
|
||
"32181 VisualEditor: Two replacements within the same... \n",
|
||
"\n",
|
||
" comment_text date_created \\\n",
|
||
"708 Tested on both the Italian and the English Wik... 1380976920 \n",
|
||
"709 Note that this is fixed and has been deployed ... 1381281033 \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... 1381267451 \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... 1381168024 \n",
|
||
"718 Krinkle, do I need to file a different bug for... 1381142922 \n",
|
||
"... ... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... 1354738560 \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... 1360206228 \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. 1354926921 \n",
|
||
"32180 Test case:\\n\\n+ 'removin... 1353134520 \n",
|
||
"32181 With bug 45061 all change marker code has been... 1360975473 \n",
|
||
"\n",
|
||
" speaker meta.affil \\\n",
|
||
"708 PHID-USER-wil4b5lylrvf3krixlkl True \n",
|
||
"709 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"712 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"717 PHID-USER-sai77mtxmpqnm6pycyvz True \n",
|
||
"718 PHID-USER-wil4b5lylrvf3krixlkl True \n",
|
||
"... ... ... \n",
|
||
"32172 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32178 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32179 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32180 PHID-USER-fovtl67ew4l4cc3oeypc False \n",
|
||
"32181 PHID-USER-it53o2f2kyryqyj33uzt False \n",
|
||
"\n",
|
||
" conversation_id comment_type status \\\n",
|
||
"708 PHID-TASK-64s56xzrc22ustp2z7wx task_description resolved \n",
|
||
"709 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"712 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"717 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"718 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"... ... ... ... \n",
|
||
"32172 PHID-TASK-ciosa56mnibqn4lx27ub task_description resolved \n",
|
||
"32178 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n",
|
||
"32179 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n",
|
||
"32180 PHID-TASK-guukovmsjsnlpphgujcv task_description invalid \n",
|
||
"32181 PHID-TASK-guukovmsjsnlpphgujcv task_subcomment NaN \n",
|
||
"\n",
|
||
" text \\\n",
|
||
"708 Tested on both the Italian and the English Wik... \n",
|
||
"709 Note that this is fixed and has been deployed ... \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... \n",
|
||
"718 Krinkle, do I need to file a different bug for... \n",
|
||
"... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n",
|
||
"32180 Test case:\\n\\n+ 'removin... \n",
|
||
"32181 With bug 45061 all change marker code has been... \n",
|
||
"\n",
|
||
" resolved_text meta.gerrit id \\\n",
|
||
"708 Tested on both the Italian and the English Wik... False 709 \n",
|
||
"709 Note that this is fixed and has been deployed ... False 710 \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... False 713 \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... False 718 \n",
|
||
"718 Krinkle, do Krinkle need to file a different b... False 719 \n",
|
||
"... ... ... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... False 32173 \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... False 32179 \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. False 32180 \n",
|
||
"32180 Test case:\\n\\n+ 'removin... False 32181 \n",
|
||
"32181 With bug 45061 all change marker code has been... False 32182 \n",
|
||
"\n",
|
||
" reply_to first_comment timestamp \n",
|
||
"708 NaN False 2013-10-05 12:42:00+00:00 \n",
|
||
"709 709.0 False 2013-10-09 01:10:33+00:00 \n",
|
||
"712 712.0 False 2013-10-08 21:24:11+00:00 \n",
|
||
"717 717.0 False 2013-10-07 17:47:04+00:00 \n",
|
||
"718 718.0 False 2013-10-07 10:48:42+00:00 \n",
|
||
"... ... ... ... \n",
|
||
"32172 NaN False 2012-12-05 20:16:00+00:00 \n",
|
||
"32178 32178.0 False 2013-02-07 03:03:48+00:00 \n",
|
||
"32179 32179.0 False 2012-12-08 00:35:21+00:00 \n",
|
||
"32180 NaN False 2012-11-17 06:42:00+00:00 \n",
|
||
"32181 32181.0 False 2013-02-16 00:44:33+00:00 \n",
|
||
"\n",
|
||
"[8804 rows x 15 columns]"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "d226d781-b002-4842-a3ae-92d4851a5878",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import re\n",
|
||
"\n",
|
||
"def preprocess_text(text):\n",
|
||
"    \"\"\"Normalise one comment body before it is parsed.\n",
|
||
"\n",
|
||
"    Removes URLs, then replaces every '*' and '-' with a space.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        text: comment body. Non-string values are converted with str();\n",
|
||
"            missing scalars (None, NaN, pd.NA) produce an empty string.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        The cleaned string.\n",
|
||
"    \"\"\"\n",
|
||
"    # str(NaN) would otherwise inject the literal word 'nan' into the text.\n",
|
||
"    if pd.api.types.is_scalar(text) and pd.isna(text):\n",
|
||
"        return ''\n",
|
||
"    text = str(text)\n",
|
||
"    # Strip URLs first: once '-' has become a space the pattern stops at the first\n",
|
||
"    # hyphen and leaves the rest of the URL behind.\n",
|
||
"    text = re.sub(r'http\\S+', '', text)\n",
|
||
"    text = text.replace('*', ' ')\n",
|
||
"    text = text.replace('-', ' ')\n",
|
||
"    return text"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/3649688126.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "b8eddf40-1fe2-4fce-be74-b32552b40c57",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/1316816771.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
||
"\n",
|
||
"def extract_dependency_tree(sentence):\n",
|
||
"    \"\"\"Parse text with the loaded spaCy pipeline and return one tuple per token.\n",
|
||
"\n",
|
||
"    Each tuple is (text, lemma, dependency label, head text, ancestors, subtree,\n",
|
||
"    children); the last three items are lists of spaCy tokens.\n",
|
||
"    \"\"\"\n",
|
||
"    doc = nlp(sentence)\n",
|
||
"    # ancestors / subtree / children are generators. Stored as-is they appear in the\n",
|
||
"    # DataFrame as '<generator object ...>' and can be iterated only once, so they\n",
|
||
"    # are materialised into lists here.\n",
|
||
"    return [\n",
|
||
"        (\n",
|
||
"            token.text,\n",
|
||
"            token.lemma_,\n",
|
||
"            token.dep_,\n",
|
||
"            token.head.text,\n",
|
||
"            list(token.ancestors),\n",
|
||
"            list(token.subtree),\n",
|
||
"            list(token.children),\n",
|
||
"        )\n",
|
||
"        for token in doc\n",
|
||
"    ]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/2805711855.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "337a528a-5667-4e1f-ac9a-37caabc03a18",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/2117289791.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#get VAD scores\n",
|
||
"import numpy as np\n",
|
||
"#https://saifmohammad.com/WebPages/nrc-vad.html\n",
|
||
"# The lexicon file is read as tab-separated text with no header row.\n",
|
||
"column_headings = ['Word', 'Valence', 'Arousal', 'Domination']\n",
|
||
"vad_lexicon = pd.read_csv(\n",
|
||
"    'NRC-VAD-Lexicon.txt',\n",
|
||
"    sep='\\t',\n",
|
||
"    names=column_headings,\n",
|
||
"    header=None,\n",
|
||
")\n",
|
||
"# word -> {'Valence': ..., 'Arousal': ..., 'Domination': ...}\n",
|
||
"vad_dict = vad_lexicon.set_index('Word').transpose().to_dict()\n",
|
||
"\n",
|
||
"def vad_scoring(dependency_tree):\n",
|
||
"    \"\"\"Average the lexicon scores of the lemmas in one parsed comment.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        dependency_tree: iterable of 7-item token tuples; the second item is the lemma.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        [valence, arousal, dominance] means over the lemmas present in vad_dict;\n",
|
||
"        each value is 0 when no lemma matched.\n",
|
||
"    \"\"\"\n",
|
||
"    hits = [\n",
|
||
"        vad_dict[lemma]\n",
|
||
"        for _text, lemma, _dep, _head, _ancestors, _subtree, _children in dependency_tree\n",
|
||
"        if lemma in vad_dict\n",
|
||
"    ]\n",
|
||
"    # One emptiness check covers all three dimensions: every hit contributes to each.\n",
|
||
"    return [\n",
|
||
"        np.mean([hit[dimension] for hit in hits]) if hits else 0\n",
|
||
"        for dimension in ('Valence', 'Arousal', 'Domination')\n",
|
||
"    ]\n",
|
||
"\n",
|
||
"def dominance_prevail(dependency_tree):\n",
|
||
"    \"\"\"Count the tokens whose lemma has a 'Domination' score of at least 0.75 in vad_dict.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        dependency_tree: iterable of 7-item token tuples; the second item is the lemma.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        Number of matching tokens (0 when none match).\n",
|
||
"    \"\"\"\n",
|
||
"    return sum(\n",
|
||
"        1\n",
|
||
"        for _text, lemma, _dep, _head, _ancestors, _subtree, _children in dependency_tree\n",
|
||
"        if lemma in vad_dict and vad_dict[lemma]['Domination'] >= 0.75\n",
|
||
"    )\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "828fb57a-e152-42ef-9c60-660648898532",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#establishing per-comment VAD scores \n",
|
||
"comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n",
|
||
"comment_phab_df['dominant_wc'] = comment_phab_df['dependency_tree'].apply(dominance_prevail)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "27e47f6f-0257-4b70-b222-e91ef888c900",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>task_title</th>\n",
|
||
" <th>comment_text</th>\n",
|
||
" <th>date_created</th>\n",
|
||
" <th>speaker</th>\n",
|
||
" <th>meta.affil</th>\n",
|
||
" <th>conversation_id</th>\n",
|
||
" <th>comment_type</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>text</th>\n",
|
||
" <th>resolved_text</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>first_comment</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>processed_text</th>\n",
|
||
" <th>processed_resolved_text</th>\n",
|
||
" <th>dependency_tree</th>\n",
|
||
" <th>resolved_dependency_tree</th>\n",
|
||
" <th>average_v_score</th>\n",
|
||
" <th>average_a_score</th>\n",
|
||
" <th>average_d_score</th>\n",
|
||
" <th>dominant_wc</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>708</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>1380976920</td>\n",
|
||
" <td>PHID-USER-wil4b5lylrvf3krixlkl</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-05 12:42:00+00:00</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>Tested on both the Italian and the English Wik...</td>\n",
|
||
" <td>[(Tested, test, advcl, Reach, <generator objec...</td>\n",
|
||
" <td>[(Tested, test, advcl, Reach, <generator objec...</td>\n",
|
||
" <td>0.575304</td>\n",
|
||
" <td>0.397913</td>\n",
|
||
" <td>0.475913</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>709</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>1381281033</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-09 01:10:33+00:00</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>Note that this is fixed and has been deployed ...</td>\n",
|
||
" <td>[(Note, note, ROOT, Note, <generator object at...</td>\n",
|
||
" <td>[(Note, note, ROOT, Note, <generator object at...</td>\n",
|
||
" <td>0.623100</td>\n",
|
||
" <td>0.422900</td>\n",
|
||
" <td>0.543500</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>712</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>1381267451</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>*** Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-08 21:24:11+00:00</td>\n",
|
||
" <td>Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>Bug 55362 has been marked as a duplicate o...</td>\n",
|
||
" <td>[( , , dep, Bug, <generator object at 0...</td>\n",
|
||
" <td>[( , , dep, Bug, <generator object at 0...</td>\n",
|
||
" <td>0.501833</td>\n",
|
||
" <td>0.391667</td>\n",
|
||
" <td>0.429500</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>717</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>1381168024</td>\n",
|
||
" <td>PHID-USER-sai77mtxmpqnm6pycyvz</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-07 17:47:04+00:00</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>(In reply to comment #6)\\n> Krinkle, do I need...</td>\n",
|
||
" <td>[((, (, punct, comment, <generator object at 0...</td>\n",
|
||
" <td>[((, (, punct, comment, <generator object at 0...</td>\n",
|
||
" <td>0.569450</td>\n",
|
||
" <td>0.405600</td>\n",
|
||
" <td>0.437650</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>718</th>\n",
|
||
" <td>VisualEditor: [Regression] \"More\" menu gets sh...</td>\n",
|
||
" <td>Krinkle, do I need to file a different bug for...</td>\n",
|
||
" <td>1381142922</td>\n",
|
||
" <td>PHID-USER-wil4b5lylrvf3krixlkl</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-64s56xzrc22ustp2z7wx</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Krinkle, do I need to file a different bug for...</td>\n",
|
||
" <td>Krinkle, do Krinkle need to file a different b...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-10-07 10:48:42+00:00</td>\n",
|
||
" <td>Krinkle, do I need to file a different bug for...</td>\n",
|
||
" <td>Krinkle, do Krinkle need to file a different b...</td>\n",
|
||
" <td>[(Krinkle, Krinkle, npadvmod, need, <generator...</td>\n",
|
||
" <td>[(Krinkle, Krinkle, npadvmod, need, <generator...</td>\n",
|
||
" <td>0.614556</td>\n",
|
||
" <td>0.432444</td>\n",
|
||
" <td>0.437667</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32172</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>1354738560</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\nhttps://gerrit....</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-12-05 20:16:00+00:00</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf...</td>\n",
|
||
" <td>Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf...</td>\n",
|
||
" <td>[(Puppet, puppet, compound, config, <generator...</td>\n",
|
||
" <td>[(Puppet, puppet, compound, config, <generator...</td>\n",
|
||
" <td>0.525333</td>\n",
|
||
" <td>0.429333</td>\n",
|
||
" <td>0.401333</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32178</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>1360206228</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>gerrit-wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-02-07 03:03:48+00:00</td>\n",
|
||
" <td>gerrit wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>gerrit wm is done, but wikibugs is \"an almight...</td>\n",
|
||
" <td>[(gerrit, gerrit, compound, wm, <generator obj...</td>\n",
|
||
" <td>[(gerrit, gerrit, compound, wm, <generator obj...</td>\n",
|
||
" <td>0.595818</td>\n",
|
||
" <td>0.512091</td>\n",
|
||
" <td>0.566273</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32179</th>\n",
|
||
" <td>Setup wikibugs and gerrit-wm for #mediawiki-vi...</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>1354926921</td>\n",
|
||
" <td>PHID-USER-ydswvwhh5pm4lshahjje</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>PHID-TASK-ciosa56mnibqn4lx27ub</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-12-08 00:35:21+00:00</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>Attempted fixes in Gerrit 37566 and Gerrit 37570.</td>\n",
|
||
" <td>[(Attempted, attempt, amod, fixes, <generator ...</td>\n",
|
||
" <td>[(Attempted, attempt, amod, fixes, <generator ...</td>\n",
|
||
" <td>0.692500</td>\n",
|
||
" <td>0.514500</td>\n",
|
||
" <td>0.475000</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32180</th>\n",
|
||
" <td>VisualEditor: Two replacements within the same...</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>1353134520</td>\n",
|
||
" <td>PHID-USER-fovtl67ew4l4cc3oeypc</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-guukovmsjsnlpphgujcv</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>invalid</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2012-11-17 06:42:00+00:00</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>Test case:\\n\\n+ 'removin...</td>\n",
|
||
" <td>[(Test, test, compound, case, <generator objec...</td>\n",
|
||
" <td>[(Test, test, compound, case, <generator objec...</td>\n",
|
||
" <td>0.567509</td>\n",
|
||
" <td>0.448561</td>\n",
|
||
" <td>0.535053</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32181</th>\n",
|
||
" <td>VisualEditor: Two replacements within the same...</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>1360975473</td>\n",
|
||
" <td>PHID-USER-it53o2f2kyryqyj33uzt</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-guukovmsjsnlpphgujcv</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>2013-02-16 00:44:33+00:00</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>With bug 45061 all change marker code has been...</td>\n",
|
||
" <td>[(With, with, prep, change, <generator object ...</td>\n",
|
||
" <td>[(With, with, prep, change, <generator object ...</td>\n",
|
||
" <td>0.530429</td>\n",
|
||
" <td>0.412000</td>\n",
|
||
" <td>0.509571</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8804 rows × 23 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" task_title \\\n",
|
||
"708 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"709 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"712 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"717 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"718 VisualEditor: [Regression] \"More\" menu gets sh... \n",
|
||
"... ... \n",
|
||
"32172 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32178 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32179 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n",
|
||
"32180 VisualEditor: Two replacements within the same... \n",
|
||
"32181 VisualEditor: Two replacements within the same... \n",
|
||
"\n",
|
||
" comment_text date_created \\\n",
|
||
"708 Tested on both the Italian and the English Wik... 1380976920 \n",
|
||
"709 Note that this is fixed and has been deployed ... 1381281033 \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... 1381267451 \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... 1381168024 \n",
|
||
"718 Krinkle, do I need to file a different bug for... 1381142922 \n",
|
||
"... ... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... 1354738560 \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... 1360206228 \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. 1354926921 \n",
|
||
"32180 Test case:\\n\\n+ 'removin... 1353134520 \n",
|
||
"32181 With bug 45061 all change marker code has been... 1360975473 \n",
|
||
"\n",
|
||
" speaker meta.affil \\\n",
|
||
"708 PHID-USER-wil4b5lylrvf3krixlkl True \n",
|
||
"709 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"712 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"717 PHID-USER-sai77mtxmpqnm6pycyvz True \n",
|
||
"718 PHID-USER-wil4b5lylrvf3krixlkl True \n",
|
||
"... ... ... \n",
|
||
"32172 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32178 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32179 PHID-USER-ydswvwhh5pm4lshahjje True \n",
|
||
"32180 PHID-USER-fovtl67ew4l4cc3oeypc False \n",
|
||
"32181 PHID-USER-it53o2f2kyryqyj33uzt False \n",
|
||
"\n",
|
||
" conversation_id comment_type status \\\n",
|
||
"708 PHID-TASK-64s56xzrc22ustp2z7wx task_description resolved \n",
|
||
"709 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"712 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"717 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"718 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n",
|
||
"... ... ... ... \n",
|
||
"32172 PHID-TASK-ciosa56mnibqn4lx27ub task_description resolved \n",
|
||
"32178 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n",
|
||
"32179 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n",
|
||
"32180 PHID-TASK-guukovmsjsnlpphgujcv task_description invalid \n",
|
||
"32181 PHID-TASK-guukovmsjsnlpphgujcv task_subcomment NaN \n",
|
||
"\n",
|
||
" text \\\n",
|
||
"708 Tested on both the Italian and the English Wik... \n",
|
||
"709 Note that this is fixed and has been deployed ... \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... \n",
|
||
"718 Krinkle, do I need to file a different bug for... \n",
|
||
"... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n",
|
||
"32180 Test case:\\n\\n+ 'removin... \n",
|
||
"32181 With bug 45061 all change marker code has been... \n",
|
||
"\n",
|
||
" resolved_text ... first_comment \\\n",
|
||
"708 Tested on both the Italian and the English Wik... ... False \n",
|
||
"709 Note that this is fixed and has been deployed ... ... False \n",
|
||
"712 *** Bug 55362 has been marked as a duplicate o... ... False \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... ... False \n",
|
||
"718 Krinkle, do Krinkle need to file a different b... ... False \n",
|
||
"... ... ... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... ... False \n",
|
||
"32178 gerrit-wm is done, but wikibugs is \"an almight... ... False \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. ... False \n",
|
||
"32180 Test case:\\n\\n+ 'removin... ... False \n",
|
||
"32181 With bug 45061 all change marker code has been... ... False \n",
|
||
"\n",
|
||
" timestamp \\\n",
|
||
"708 2013-10-05 12:42:00+00:00 \n",
|
||
"709 2013-10-09 01:10:33+00:00 \n",
|
||
"712 2013-10-08 21:24:11+00:00 \n",
|
||
"717 2013-10-07 17:47:04+00:00 \n",
|
||
"718 2013-10-07 10:48:42+00:00 \n",
|
||
"... ... \n",
|
||
"32172 2012-12-05 20:16:00+00:00 \n",
|
||
"32178 2013-02-07 03:03:48+00:00 \n",
|
||
"32179 2012-12-08 00:35:21+00:00 \n",
|
||
"32180 2012-11-17 06:42:00+00:00 \n",
|
||
"32181 2013-02-16 00:44:33+00:00 \n",
|
||
"\n",
|
||
" processed_text \\\n",
|
||
"708 Tested on both the Italian and the English Wik... \n",
|
||
"709 Note that this is fixed and has been deployed ... \n",
|
||
"712 Bug 55362 has been marked as a duplicate o... \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... \n",
|
||
"718 Krinkle, do I need to file a different bug for... \n",
|
||
"... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... \n",
|
||
"32178 gerrit wm is done, but wikibugs is \"an almight... \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n",
|
||
"32180 Test case:\\n\\n+ 'removin... \n",
|
||
"32181 With bug 45061 all change marker code has been... \n",
|
||
"\n",
|
||
" processed_resolved_text \\\n",
|
||
"708 Tested on both the Italian and the English Wik... \n",
|
||
"709 Note that this is fixed and has been deployed ... \n",
|
||
"712 Bug 55362 has been marked as a duplicate o... \n",
|
||
"717 (In reply to comment #6)\\n> Krinkle, do I need... \n",
|
||
"718 Krinkle, do Krinkle need to file a different b... \n",
|
||
"... ... \n",
|
||
"32172 Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... \n",
|
||
"32178 gerrit wm is done, but wikibugs is \"an almight... \n",
|
||
"32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n",
|
||
"32180 Test case:\\n\\n+ 'removin... \n",
|
||
"32181 With bug 45061 all change marker code has been... \n",
|
||
"\n",
|
||
" dependency_tree \\\n",
|
||
"708 [(Tested, test, advcl, Reach, <generator objec... \n",
|
||
"709 [(Note, note, ROOT, Note, <generator object at... \n",
|
||
"712 [( , , dep, Bug, <generator object at 0... \n",
|
||
"717 [((, (, punct, comment, <generator object at 0... \n",
|
||
"718 [(Krinkle, Krinkle, npadvmod, need, <generator... \n",
|
||
"... ... \n",
|
||
"32172 [(Puppet, puppet, compound, config, <generator... \n",
|
||
"32178 [(gerrit, gerrit, compound, wm, <generator obj... \n",
|
||
"32179 [(Attempted, attempt, amod, fixes, <generator ... \n",
|
||
"32180 [(Test, test, compound, case, <generator objec... \n",
|
||
"32181 [(With, with, prep, change, <generator object ... \n",
|
||
"\n",
|
||
" resolved_dependency_tree average_v_score \\\n",
|
||
"708 [(Tested, test, advcl, Reach, <generator objec... 0.575304 \n",
|
||
"709 [(Note, note, ROOT, Note, <generator object at... 0.623100 \n",
|
||
"712 [( , , dep, Bug, <generator object at 0... 0.501833 \n",
|
||
"717 [((, (, punct, comment, <generator object at 0... 0.569450 \n",
|
||
"718 [(Krinkle, Krinkle, npadvmod, need, <generator... 0.614556 \n",
|
||
"... ... ... \n",
|
||
"32172 [(Puppet, puppet, compound, config, <generator... 0.525333 \n",
|
||
"32178 [(gerrit, gerrit, compound, wm, <generator obj... 0.595818 \n",
|
||
"32179 [(Attempted, attempt, amod, fixes, <generator ... 0.692500 \n",
|
||
"32180 [(Test, test, compound, case, <generator objec... 0.567509 \n",
|
||
"32181 [(With, with, prep, change, <generator object ... 0.530429 \n",
|
||
"\n",
|
||
" average_a_score average_d_score dominant_wc \n",
|
||
"708 0.397913 0.475913 2 \n",
|
||
"709 0.422900 0.543500 0 \n",
|
||
"712 0.391667 0.429500 0 \n",
|
||
"717 0.405600 0.437650 1 \n",
|
||
"718 0.432444 0.437667 1 \n",
|
||
"... ... ... ... \n",
|
||
"32172 0.429333 0.401333 0 \n",
|
||
"32178 0.512091 0.566273 3 \n",
|
||
"32179 0.514500 0.475000 0 \n",
|
||
"32180 0.448561 0.535053 4 \n",
|
||
"32181 0.412000 0.509571 0 \n",
|
||
"\n",
|
||
"[8804 rows x 23 columns]"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n",
|
||
"comment_phab_df = comment_phab_df.drop(columns=['avg_vad_scores'])\n",
|
||
"comment_phab_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "184ccbe6-0a7a-41b8-9b02-bc439ff975d0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# expand the dependency parser \n",
|
||
"dependency_relations = []\n",
|
||
"resolved_dependency_relations = []\n",
|
||
"\n",
|
||
"for index, row in comment_phab_df.iterrows():\n",
|
||
" text = row['text']\n",
|
||
" timestamp = row['timestamp']\n",
|
||
" comment_id = row['id']\n",
|
||
" conversation_id = row['conversation_id']\n",
|
||
" WMFaffil = row['meta.affil']\n",
|
||
" \n",
|
||
" for token, lemma, dep, head, ancestors, subtree, children in row['dependency_tree']:\n",
|
||
" if re.search(r'\\b(visualeditor|VE|ve|VisualEditor)\\b', token, re.IGNORECASE):\n",
|
||
" dependency_relations.append({\n",
|
||
" 'comment_id': comment_id,\n",
|
||
" 'timestamp': timestamp,\n",
|
||
" 'wmfAffil':WMFaffil,\n",
|
||
" 'token': token,\n",
|
||
" 'dependency': dep,\n",
|
||
" 'head': head,\n",
|
||
" 'depth': len(list(ancestors)), \n",
|
||
" 'children': len(list(children)) \n",
|
||
" })\n",
|
||
" for token, lemma, dep, head, ancestors, subtree, children in row['resolved_dependency_tree']:\n",
|
||
" if re.search(r'\\b(visualeditor|VE|ve|VisualEditor)\\b', token, re.IGNORECASE):\n",
|
||
" resolved_dependency_relations.append({\n",
|
||
" 'comment_id': comment_id,\n",
|
||
" 'timestamp': timestamp,\n",
|
||
" 'wmfAffil':WMFaffil,\n",
|
||
" 'token': token,\n",
|
||
" 'dependency': dep,\n",
|
||
" 'head': head,\n",
|
||
" 'depth': len(list(ancestors)), \n",
|
||
" 'children': len(list(children)) \n",
|
||
" })\n",
|
||
"\n",
|
||
"resolved_dependency_relations_df = pd.DataFrame(resolved_dependency_relations) \n",
|
||
"dependency_relations_df = pd.DataFrame(dependency_relations)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "82498686-14f4-40c8-9e33-27b31f115b47",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#now analysis/plotting \n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from matplotlib.gridspec import GridSpec"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 60,
|
||
"id": "82cd9dde-0d14-4de5-8482-5a39de8d2869",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/3248547585.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" task_phab_df['first_comment'] = task_phab_df.groupby('speaker')['timestamp'].rank(method='first') == 1\n",
|
||
"/tmp/ipykernel_49967/3248547585.py:6: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
||
" task_phab_df['week'] = task_phab_df['timestamp'].dt.to_period('W').dt.start_time\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1000x600 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"plt.figure(figsize=(10, 6))\n",
|
||
"task_phab_df = phab_df[phab_df['comment_type']==\"task_description\"].copy()\n",
|
||
"task_phab_df['first_comment'] = task_phab_df.groupby('speaker')['timestamp'].rank(method='first') == 1\n",
|
||
"task_phab_df = task_phab_df[(task_phab_df['date_created'] < 1383264000) & (task_phab_df['date_created'] > 1351728000)]\n",
|
||
"\n",
|
||
"task_phab_df['week'] = task_phab_df['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"unique_taskPHIDs = task_phab_df.groupby('week')['conversation_id'].nunique()\n",
|
||
"\n",
|
||
"wmf_task_phab_df = task_phab_df[task_phab_df['meta.affil'] == True]\n",
|
||
"wmf_tasks = wmf_task_phab_df.groupby('week')['conversation_id'].nunique()\n",
|
||
"\n",
|
||
"other_task_phab_df = task_phab_df[task_phab_df['meta.affil'] != True]\n",
|
||
"other_tasks = other_task_phab_df.groupby('week')['conversation_id'].nunique()\n",
|
||
"\n",
|
||
"new_tasks_phab_df = task_phab_df[task_phab_df['first_comment'] == True]\n",
|
||
"new_tasks = new_tasks_phab_df.groupby('week')['conversation_id'].nunique()\n",
|
||
"\n",
|
||
"sns.lineplot(x=unique_taskPHIDs.index, y=unique_taskPHIDs.values, color='black', label='Total', marker='o')\n",
|
||
"sns.lineplot(x=wmf_tasks.index, y=wmf_tasks.values, color='#c7756a', label='WMF-affiliated authors', marker='o')\n",
|
||
"sns.lineplot(x=other_tasks.index, y=other_tasks.values, color='#5da2d8', label='Nonaffiliated authors', marker='o')\n",
|
||
"sns.lineplot(x=new_tasks.index, y=new_tasks.values, color=\"green\", label=\"first-timers\", marker='o')\n",
|
||
"\n",
|
||
"plt.title('New Phabricator Tasks Indexed with \"VisualEditor\"')\n",
|
||
"plt.xlabel('Timestamp')\n",
|
||
"plt.ylabel('Unique taskPHIDs')\n",
|
||
"plt.xticks(rotation=45)\n",
|
||
"plt.grid(True)\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"id": "b7cfad77-d48a-4708-91f3-89ae1179b90c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 1000x600 with 0 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1096.11x500 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df['before_after'] = comment_phab_df['timestamp'] > pd.Timestamp('2013-07-01 00:00:01+00:00')\n",
|
||
"plt.figure(figsize=(10, 6))\n",
|
||
"\n",
|
||
"sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"dominant_wc\", hue=\"before_after\", col=\"meta.affil\", scatter=False)\n",
|
||
"\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "d2d67d38-f005-4c94-be3c-39eb6b22686f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_49967/3455565877.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
|
||
" filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True)]\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:3: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
|
||
" resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True)]\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:18: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
||
" filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:18: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:37: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
||
" resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:37: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"/tmp/ipykernel_49967/3455565877.py:40: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
|
||
" resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "IndexingError",
|
||
"evalue": "Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mIndexingError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[0;32mIn[53], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m resolved_filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m resolved_filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mdt\u001b[38;5;241m.\u001b[39mto_period(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mW\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mdt\u001b[38;5;241m.\u001b[39mstart_time\n\u001b[1;32m 38\u001b[0m resolved_median_depth \u001b[38;5;241m=\u001b[39m resolved_filtered_dependencies\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdepth\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmedian()\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[0;32m---> 40\u001b[0m resolved_wmf_filtered_dependencies \u001b[38;5;241m=\u001b[39m \u001b[43mresolved_filtered_dependencies\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfiltered_dependencies\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwmfAffil\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m]\u001b[49m\n\u001b[1;32m 41\u001b[0m resolved_wmf_median_depth \u001b[38;5;241m=\u001b[39m resolved_wmf_filtered_dependencies\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdepth\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmedian()\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[1;32m 43\u001b[0m resolved_other_filtered_dependencies \u001b[38;5;241m=\u001b[39m 
resolved_filtered_dependencies[filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwmfAffil\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m]\n",
|
||
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/frame.py:4093\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;66;03m# Do we have a (boolean) 1d indexer?\u001b[39;00m\n\u001b[1;32m 4092\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m com\u001b[38;5;241m.\u001b[39mis_bool_indexer(key):\n\u001b[0;32m-> 4093\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem_bool_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4095\u001b[0m \u001b[38;5;66;03m# We are left with two options: a single key, and a collection of keys,\u001b[39;00m\n\u001b[1;32m 4096\u001b[0m \u001b[38;5;66;03m# We interpret tuples as collections only for non-MultiIndex\u001b[39;00m\n\u001b[1;32m 4097\u001b[0m is_single_key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_list_like(key)\n",
|
||
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/frame.py:4149\u001b[0m, in \u001b[0;36mDataFrame._getitem_bool_array\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 4144\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mItem wrong length \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(key)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m instead of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4145\u001b[0m )\n\u001b[1;32m 4147\u001b[0m \u001b[38;5;66;03m# check_bool_indexer will throw exception if Series key cannot\u001b[39;00m\n\u001b[1;32m 4148\u001b[0m \u001b[38;5;66;03m# be reindexed to match DataFrame rows\u001b[39;00m\n\u001b[0;32m-> 4149\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_bool_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4151\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mall():\n\u001b[1;32m 4152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcopy(deep\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n",
|
||
"File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/indexing.py:2662\u001b[0m, in \u001b[0;36mcheck_bool_indexer\u001b[0;34m(index, key)\u001b[0m\n\u001b[1;32m 2660\u001b[0m indexer \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mget_indexer_for(index)\n\u001b[1;32m 2661\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01min\u001b[39;00m indexer:\n\u001b[0;32m-> 2662\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IndexingError(\n\u001b[1;32m 2663\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnalignable boolean Series provided as \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2664\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindexer (index of the boolean Series and of \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2665\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe indexed object do not match).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2666\u001b[0m )\n\u001b[1;32m 2668\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 2670\u001b[0m \u001b[38;5;66;03m# fall through for boolean\u001b[39;00m\n",
|
||
"\u001b[0;31mIndexingError\u001b[0m: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match)."
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x800 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"pattern = r'\\b(?:ve|VE|visualeditor|VisualEditor)\\b'\n",
|
||
"filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True)].copy()\n",
|
||
"resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True)].copy()\n",
|
||
"\n",
|
||
"plt.figure(figsize=(12, 8))\n",
|
||
"gs = GridSpec(2, 1, height_ratios=[6, 6])\n",
|
||
"\n",
|
||
"# Main plot: Token depth by timestamp\n",
|
||
"'''\n",
|
||
"ax0 = plt.subplot(gs[0])\n",
|
||
"sns.scatterplot(data=filtered_dependencies, x='timestamp', y='dependency', hue='wmfAffil', style='dependency', markers=True, s=100, ax=ax0)\n",
|
||
"ax0.set_title('VE Depth by Timestamp w/o URLS')\n",
|
||
"ax0.set_xlabel('')\n",
|
||
"ax0.set_ylabel('Dependency Type')\n",
|
||
"ax0.legend().set_visible(False)\n",
|
||
"'''\n",
|
||
"# Calculate the median depth over time\n",
|
||
"filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"median_depth = filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"wmf_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n",
|
||
"wmf_median_depth = wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"other_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] != True]\n",
|
||
"other_median_depth = other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"# Plot the median depth over time\n",
|
||
"ax0 = plt.subplot(gs[0])\n",
|
||
"sns.lineplot(data=median_depth, x='week', y='depth', ax=ax0, color='black', label='Median Depth', marker='o')\n",
|
||
"sns.lineplot(data=wmf_median_depth, x='week', y='depth', ax=ax0, color='#c7756a', label='WMF-affiliated authors', marker='x')\n",
|
||
"sns.lineplot(data=other_median_depth, x='week', y='depth', ax=ax0, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n",
|
||
"ax0.set_title('Median Depth of VE in Phabricator Sentence Dependency Trees')\n",
|
||
"ax0.set_ylabel('Median Depth')\n",
|
||
"ax0.set_xlabel('')\n",
|
||
"\n",
|
||
"# Calculate the median depth over time\n",
|
||
"resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
||
"resolved_median_depth = resolved_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] == True]\n",
|
||
"resolved_wmf_median_depth = resolved_wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"resolved_other_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] != True]\n",
|
||
"resolved_other_median_depth = resolved_other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
||
"\n",
|
||
"# Plot the median depth over time\n",
|
||
"ax1 = plt.subplot(gs[1])\n",
|
||
"sns.lineplot(data=resolved_median_depth, x='week', y='depth', ax=ax1, color='black', label='Median Depth', marker='o')\n",
|
||
"sns.lineplot(data=resolved_wmf_median_depth, x='week', y='depth', ax=ax1, color='#c7756a', label='WMF-affiliated authors', marker='x')\n",
|
||
"sns.lineplot(data=resolved_other_median_depth, x='week', y='depth', ax=ax1, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n",
|
||
"ax1.set_title('Median Depth of VE in Coreference-resolved Phabricator Sentence Dependency Trees')\n",
|
||
"ax1.set_ylabel('Median Depth')\n",
|
||
"ax1.set_xlabel('')\n",
|
||
"\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.21"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|