{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ba9e5acd-e17d-4318-9272-04c9f6706186",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e4f0b3f0-5255-46f1-822f-e455087ba315",
   "metadata": {},
   "outputs": [],
   "source": [
    "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0312_resolved_ve_phab_comments.csv\"\n",
    "phab_df = pd.read_csv(phab_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d449164e-1d28-4580-9eb1-f0f69978f114",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phabricator account whose comments are dropped below as Gerrit comments.\n",
    "GERRIT_PHID = 'PHID-USER-idceizaw6elwiwm5xshb'\n",
    "# Study window as Unix epochs (UTC): after 2012-11-01, before 2013-11-01.\n",
    "WINDOW_START_EPOCH = 1351728000\n",
    "WINDOW_END_EPOCH = 1383264000\n",
    "\n",
    "\n",
    "def link_replies(df):\n",
    "    \"\"\"Return reply_to ids chaining each conversation in chronological order.\n",
    "\n",
    "    Within a conversation the task description sorts first, then rows follow\n",
    "    by date_created (ties broken by id). Each row points at the id of the row\n",
    "    before it; the first row of a conversation gets None.\n",
    "    \"\"\"\n",
    "    ordered = df.assign(\n",
    "        _is_comment=df['comment_type'] != 'task_description'\n",
    "    ).sort_values(['conversation_id', '_is_comment', 'date_created', 'id'])\n",
    "    parent_ids = ordered.groupby('conversation_id')['id'].shift().reindex(df.index)\n",
    "    # Object dtype keeps real None; a float column would coerce None back to NaN.\n",
    "    return pd.Series(\n",
    "        [None if pd.isna(parent_id) else int(parent_id) for parent_id in parent_ids],\n",
    "        index=df.index,\n",
    "        dtype=object,\n",
    "    )\n",
    "\n",
    "\n",
    "# Rename first: rename() skips labels that are already gone, so re-running\n",
    "# this cell after the columns were renamed no longer raises a KeyError.\n",
    "phab_df = phab_df.rename(columns={\n",
    "    'AuthorPHID': 'speaker',\n",
    "    'TaskPHID': 'conversation_id',\n",
    "    'WMFaffil': 'meta.affil',\n",
    "})\n",
    "phab_df['meta.gerrit'] = phab_df['speaker'] == GERRIT_PHID\n",
    "#cleaning df\n",
    "phab_df['id'] = phab_df.index + 1\n",
    "phab_df['reply_to'] = link_replies(phab_df)\n",
    "\n",
    "phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
    "in_window = (phab_df['date_created'] > WINDOW_START_EPOCH) & (phab_df['date_created'] < WINDOW_END_EPOCH)\n",
    "filtered_phab_df = phab_df[in_window]\n",
    "\n",
    "#removing headless conversations\n",
    "task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == 'task_description']\n",
    "headed_task_phids = task_phab_df['conversation_id'].unique()\n",
    "filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
    "\n",
    "#removing gerrit comments\n",
    "comment_phab_df = filtered_phab_df[~filtered_phab_df['meta.gerrit']].copy()\n",
    "# Re-link on the final frame so reply_to never references a removed row.\n",
    "comment_phab_df['reply_to'] = link_replies(comment_phab_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
"metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unique conversation_ids: 2081\n",
      "Unique ids: 8804\n",
      "Unique speakers: 230\n"
     ]
    }
   ],
   "source": [
    "# Size of the cleaned comment set: distinct tasks, comments and authors.\n",
    "distinct_counts = {\n",
    "    column: len(comment_phab_df[column].unique())\n",
    "    for column in ('conversation_id', 'id', 'speaker')\n",
    "}\n",
    "unique_conversation_ids = distinct_counts['conversation_id']\n",
    "unique_ids = distinct_counts['id']\n",
    "unique_speakers = distinct_counts['speaker']\n",
    "\n",
    "print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n",
    "print(f\"Unique ids: {unique_ids}\")\n",
    "print(f\"Unique speakers: {unique_speakers}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c0aade6b-f425-4f9b-ae2a-721ea49712ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | task_title | \n", "comment_text | \n", "date_created | \n", "speaker | \n", "meta.affil | \n", "conversation_id | \n", "comment_type | \n", "status | \n", "text | \n", "resolved_text | \n", "meta.gerrit | \n", "id | \n", "reply_to | \n", "first_comment | \n", "timestamp | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
708 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Tested on both the Italian and the English Wik... | \n", "1380976920 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_description | \n", "resolved | \n", "Tested on both the Italian and the English Wik... | \n", "Tested on both the Italian and the English Wik... | \n", "False | \n", "709 | \n", "NaN | \n", "False | \n", "2013-10-05 12:42:00+00:00 | \n", "
709 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Note that this is fixed and has been deployed ... | \n", "1381281033 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "Note that this is fixed and has been deployed ... | \n", "Note that this is fixed and has been deployed ... | \n", "False | \n", "710 | \n", "709.0 | \n", "False | \n", "2013-10-09 01:10:33+00:00 | \n", "
712 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "1381267451 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "False | \n", "713 | \n", "712.0 | \n", "False | \n", "2013-10-08 21:24:11+00:00 | \n", "
717 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "1381168024 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "False | \n", "718 | \n", "717.0 | \n", "False | \n", "2013-10-07 17:47:04+00:00 | \n", "
718 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Krinkle, do I need to file a different bug for... | \n", "1381142922 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "Krinkle, do I need to file a different bug for... | \n", "Krinkle, do Krinkle need to file a different b... | \n", "False | \n", "719 | \n", "718.0 | \n", "False | \n", "2013-10-07 10:48:42+00:00 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
32172 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "1354738560 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_description | \n", "resolved | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "False | \n", "32173 | \n", "NaN | \n", "False | \n", "2012-12-05 20:16:00+00:00 | \n", "
32178 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "1360206228 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "False | \n", "32179 | \n", "32178.0 | \n", "False | \n", "2013-02-07 03:03:48+00:00 | \n", "
32179 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "1354926921 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "False | \n", "32180 | \n", "32179.0 | \n", "False | \n", "2012-12-08 00:35:21+00:00 | \n", "
32180 | \n", "VisualEditor: Two replacements within the same... | \n", "Test case:\\n\\n+ 'removin... | \n", "1353134520 | \n", "PHID-USER-fovtl67ew4l4cc3oeypc | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_description | \n", "invalid | \n", "Test case:\\n\\n+ 'removin... | \n", "Test case:\\n\\n+ 'removin... | \n", "False | \n", "32181 | \n", "NaN | \n", "False | \n", "2012-11-17 06:42:00+00:00 | \n", "
32181 | \n", "VisualEditor: Two replacements within the same... | \n", "With bug 45061 all change marker code has been... | \n", "1360975473 | \n", "PHID-USER-it53o2f2kyryqyj33uzt | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_subcomment | \n", "NaN | \n", "With bug 45061 all change marker code has been... | \n", "With bug 45061 all change marker code has been... | \n", "False | \n", "32182 | \n", "32181.0 | \n", "False | \n", "2013-02-16 00:44:33+00:00 | \n", "
8804 rows × 15 columns
\n", "\n", " | task_title | \n", "comment_text | \n", "date_created | \n", "speaker | \n", "meta.affil | \n", "conversation_id | \n", "comment_type | \n", "status | \n", "text | \n", "resolved_text | \n", "... | \n", "first_comment | \n", "timestamp | \n", "processed_text | \n", "processed_resolved_text | \n", "dependency_tree | \n", "resolved_dependency_tree | \n", "average_v_score | \n", "average_a_score | \n", "average_d_score | \n", "dominant_wc | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
708 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Tested on both the Italian and the English Wik... | \n", "1380976920 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_description | \n", "resolved | \n", "Tested on both the Italian and the English Wik... | \n", "Tested on both the Italian and the English Wik... | \n", "... | \n", "False | \n", "2013-10-05 12:42:00+00:00 | \n", "Tested on both the Italian and the English Wik... | \n", "Tested on both the Italian and the English Wik... | \n", "[(Tested, test, advcl, Reach, <generator objec... | \n", "[(Tested, test, advcl, Reach, <generator objec... | \n", "0.575304 | \n", "0.397913 | \n", "0.475913 | \n", "2 | \n", "
709 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Note that this is fixed and has been deployed ... | \n", "1381281033 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "Note that this is fixed and has been deployed ... | \n", "Note that this is fixed and has been deployed ... | \n", "... | \n", "False | \n", "2013-10-09 01:10:33+00:00 | \n", "Note that this is fixed and has been deployed ... | \n", "Note that this is fixed and has been deployed ... | \n", "[(Note, note, ROOT, Note, <generator object at... | \n", "[(Note, note, ROOT, Note, <generator object at... | \n", "0.623100 | \n", "0.422900 | \n", "0.543500 | \n", "0 | \n", "
712 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "1381267451 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "... | \n", "False | \n", "2013-10-08 21:24:11+00:00 | \n", "Bug 55362 has been marked as a duplicate o... | \n", "Bug 55362 has been marked as a duplicate o... | \n", "[( , , dep, Bug, <generator object at 0... | \n", "[( , , dep, Bug, <generator object at 0... | \n", "0.501833 | \n", "0.391667 | \n", "0.429500 | \n", "0 | \n", "
717 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "1381168024 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "... | \n", "False | \n", "2013-10-07 17:47:04+00:00 | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "[((, (, punct, comment, <generator object at 0... | \n", "[((, (, punct, comment, <generator object at 0... | \n", "0.569450 | \n", "0.405600 | \n", "0.437650 | \n", "1 | \n", "
718 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Krinkle, do I need to file a different bug for... | \n", "1381142922 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "Krinkle, do I need to file a different bug for... | \n", "Krinkle, do Krinkle need to file a different b... | \n", "... | \n", "False | \n", "2013-10-07 10:48:42+00:00 | \n", "Krinkle, do I need to file a different bug for... | \n", "Krinkle, do Krinkle need to file a different b... | \n", "[(Krinkle, Krinkle, npadvmod, need, <generator... | \n", "[(Krinkle, Krinkle, npadvmod, need, <generator... | \n", "0.614556 | \n", "0.432444 | \n", "0.437667 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
32172 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "1354738560 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_description | \n", "resolved | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "... | \n", "False | \n", "2012-12-05 20:16:00+00:00 | \n", "Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... | \n", "Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... | \n", "[(Puppet, puppet, compound, config, <generator... | \n", "[(Puppet, puppet, compound, config, <generator... | \n", "0.525333 | \n", "0.429333 | \n", "0.401333 | \n", "0 | \n", "
32178 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "1360206228 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "... | \n", "False | \n", "2013-02-07 03:03:48+00:00 | \n", "gerrit wm is done, but wikibugs is \"an almight... | \n", "gerrit wm is done, but wikibugs is \"an almight... | \n", "[(gerrit, gerrit, compound, wm, <generator obj... | \n", "[(gerrit, gerrit, compound, wm, <generator obj... | \n", "0.595818 | \n", "0.512091 | \n", "0.566273 | \n", "3 | \n", "
32179 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "1354926921 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "... | \n", "False | \n", "2012-12-08 00:35:21+00:00 | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "[(Attempted, attempt, amod, fixes, <generator ... | \n", "[(Attempted, attempt, amod, fixes, <generator ... | \n", "0.692500 | \n", "0.514500 | \n", "0.475000 | \n", "0 | \n", "
32180 | \n", "VisualEditor: Two replacements within the same... | \n", "Test case:\\n\\n+ 'removin... | \n", "1353134520 | \n", "PHID-USER-fovtl67ew4l4cc3oeypc | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_description | \n", "invalid | \n", "Test case:\\n\\n+ 'removin... | \n", "Test case:\\n\\n+ 'removin... | \n", "... | \n", "False | \n", "2012-11-17 06:42:00+00:00 | \n", "Test case:\\n\\n+ 'removin... | \n", "Test case:\\n\\n+ 'removin... | \n", "[(Test, test, compound, case, <generator objec... | \n", "[(Test, test, compound, case, <generator objec... | \n", "0.567509 | \n", "0.448561 | \n", "0.535053 | \n", "4 | \n", "
32181 | \n", "VisualEditor: Two replacements within the same... | \n", "With bug 45061 all change marker code has been... | \n", "1360975473 | \n", "PHID-USER-it53o2f2kyryqyj33uzt | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_subcomment | \n", "NaN | \n", "With bug 45061 all change marker code has been... | \n", "With bug 45061 all change marker code has been... | \n", "... | \n", "False | \n", "2013-02-16 00:44:33+00:00 | \n", "With bug 45061 all change marker code has been... | \n", "With bug 45061 all change marker code has been... | \n", "[(With, with, prep, change, <generator object ... | \n", "[(With, with, prep, change, <generator object ... | \n", "0.530429 | \n", "0.412000 | \n", "0.509571 | \n", "0 | \n", "
8804 rows × 23 columns
\n", "