{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ba9e5acd-e17d-4318-9272-04c9f6706186", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import spacy" ] }, { "cell_type": "code", "execution_count": 13, "id": "e4f0b3f0-5255-46f1-822f-e455087ba315", "metadata": {}, "outputs": [], "source": [ "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0312_resolved_ve_phab_comments.csv\"\n", "phab_df = pd.read_csv(phab_path)" ] }, { "cell_type": "code", "execution_count": 14, "id": "d449164e-1d28-4580-9eb1-f0f69978f114", "metadata": {}, "outputs": [], "source": [ "#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n", "phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n", "#cleaning df\n", "phab_df['id'] = phab_df.index + 1\n", "#may have to build out the reply_to column \n", "phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n", "phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n", "\n", "phab_df = phab_df.rename(columns={\n", " 'AuthorPHID': 'speaker',\n", " 'TaskPHID': 'conversation_id',\n", " 'WMFaffil':'meta.affil',\n", " 'isGerrit': 'meta.gerrit'\n", "})\n", "\n", "# after 11-1-2012 before 11-1-2013\n", "phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n", "filtered_phab_df = phab_df[(phab_df['date_created'] < 1383264000) & (phab_df['date_created'] > 1351728000)]\n", "\n", "#removing headless conversations\n", "task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n", "headed_task_phids = task_phab_df['conversation_id'].unique()\n", "filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n", "\n", "#removing gerrit comments \n", "comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]" ] }, { "cell_type": "code", "execution_count": 15, "id": "942344db-c8f5-4ed6-a757-c97f8454f18b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique conversation_ids: 2081\n", "Unique ids: 8804\n", "Unique speakers: 230\n" ] } ], "source": [ "unique_conversation_ids = len(comment_phab_df['conversation_id'].unique())\n", "unique_ids = len(comment_phab_df['id'].unique())\n", "unique_speakers = len(comment_phab_df['speaker'].unique())\n", "\n", "print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n", "print(f\"Unique ids: {unique_ids}\")\n", "print(f\"Unique speakers: {unique_speakers}\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "c0aade6b-f425-4f9b-ae2a-721ea49712ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
task_titlecomment_textdate_createdspeakermeta.affilconversation_idcomment_typestatustextresolved_textmeta.gerritidreply_tofirst_commenttimestamp
708VisualEditor: [Regression] \"More\" menu gets sh...Tested on both the Italian and the English Wik...1380976920PHID-USER-wil4b5lylrvf3krixlklTruePHID-TASK-64s56xzrc22ustp2z7wxtask_descriptionresolvedTested on both the Italian and the English Wik...Tested on both the Italian and the English Wik...False709NaNFalse2013-10-05 12:42:00+00:00
709VisualEditor: [Regression] \"More\" menu gets sh...Note that this is fixed and has been deployed ...1381281033PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaNNote that this is fixed and has been deployed ...Note that this is fixed and has been deployed ...False710709.0False2013-10-09 01:10:33+00:00
712VisualEditor: [Regression] \"More\" menu gets sh...*** Bug 55362 has been marked as a duplicate o...1381267451PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaN*** Bug 55362 has been marked as a duplicate o...*** Bug 55362 has been marked as a duplicate o...False713712.0False2013-10-08 21:24:11+00:00
717VisualEditor: [Regression] \"More\" menu gets sh...(In reply to comment #6)\\n> Krinkle, do I need...1381168024PHID-USER-sai77mtxmpqnm6pycyvzTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaN(In reply to comment #6)\\n> Krinkle, do I need...(In reply to comment #6)\\n> Krinkle, do I need...False718717.0False2013-10-07 17:47:04+00:00
718VisualEditor: [Regression] \"More\" menu gets sh...Krinkle, do I need to file a different bug for...1381142922PHID-USER-wil4b5lylrvf3krixlklTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaNKrinkle, do I need to file a different bug for...Krinkle, do Krinkle need to file a different b...False719718.0False2013-10-07 10:48:42+00:00
................................................
32172Setup wikibugs and gerrit-wm for #mediawiki-vi...Puppet config for wikibugs:\\n\\nhttps://gerrit....1354738560PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_descriptionresolvedPuppet config for wikibugs:\\n\\nhttps://gerrit....Puppet config for wikibugs:\\n\\nhttps://gerrit....False32173NaNFalse2012-12-05 20:16:00+00:00
32178Setup wikibugs and gerrit-wm for #mediawiki-vi...gerrit-wm is done, but wikibugs is \"an almight...1360206228PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_subcommentNaNgerrit-wm is done, but wikibugs is \"an almight...gerrit-wm is done, but wikibugs is \"an almight...False3217932178.0False2013-02-07 03:03:48+00:00
32179Setup wikibugs and gerrit-wm for #mediawiki-vi...Attempted fixes in Gerrit 37566 and Gerrit 37570.1354926921PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_subcommentNaNAttempted fixes in Gerrit 37566 and Gerrit 37570.Attempted fixes in Gerrit 37566 and Gerrit 37570.False3218032179.0False2012-12-08 00:35:21+00:00
32180VisualEditor: Two replacements within the same...Test case:\\n\\n+ 'removin...1353134520PHID-USER-fovtl67ew4l4cc3oeypcFalsePHID-TASK-guukovmsjsnlpphgujcvtask_descriptioninvalidTest case:\\n\\n+ 'removin...Test case:\\n\\n+ 'removin...False32181NaNFalse2012-11-17 06:42:00+00:00
32181VisualEditor: Two replacements within the same...With bug 45061 all change marker code has been...1360975473PHID-USER-it53o2f2kyryqyj33uztFalsePHID-TASK-guukovmsjsnlpphgujcvtask_subcommentNaNWith bug 45061 all change marker code has been...With bug 45061 all change marker code has been...False3218232181.0False2013-02-16 00:44:33+00:00
\n", "

8804 rows × 15 columns

\n", "
" ], "text/plain": [ " task_title \\\n", "708 VisualEditor: [Regression] \"More\" menu gets sh... \n", "709 VisualEditor: [Regression] \"More\" menu gets sh... \n", "712 VisualEditor: [Regression] \"More\" menu gets sh... \n", "717 VisualEditor: [Regression] \"More\" menu gets sh... \n", "718 VisualEditor: [Regression] \"More\" menu gets sh... \n", "... ... \n", "32172 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32178 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32179 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32180 VisualEditor: Two replacements within the same... \n", "32181 VisualEditor: Two replacements within the same... \n", "\n", " comment_text date_created \\\n", "708 Tested on both the Italian and the English Wik... 1380976920 \n", "709 Note that this is fixed and has been deployed ... 1381281033 \n", "712 *** Bug 55362 has been marked as a duplicate o... 1381267451 \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... 1381168024 \n", "718 Krinkle, do I need to file a different bug for... 1381142922 \n", "... ... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... 1354738560 \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... 1360206228 \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. 1354926921 \n", "32180 Test case:\\n\\n+ 'removin... 1353134520 \n", "32181 With bug 45061 all change marker code has been... 1360975473 \n", "\n", " speaker meta.affil \\\n", "708 PHID-USER-wil4b5lylrvf3krixlkl True \n", "709 PHID-USER-ydswvwhh5pm4lshahjje True \n", "712 PHID-USER-ydswvwhh5pm4lshahjje True \n", "717 PHID-USER-sai77mtxmpqnm6pycyvz True \n", "718 PHID-USER-wil4b5lylrvf3krixlkl True \n", "... ... ... \n", "32172 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32178 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32179 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32180 PHID-USER-fovtl67ew4l4cc3oeypc False \n", "32181 PHID-USER-it53o2f2kyryqyj33uzt False \n", "\n", " conversation_id comment_type status \\\n", "708 PHID-TASK-64s56xzrc22ustp2z7wx task_description resolved \n", "709 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "712 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "717 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "718 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "... ... ... ... \n", "32172 PHID-TASK-ciosa56mnibqn4lx27ub task_description resolved \n", "32178 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n", "32179 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n", "32180 PHID-TASK-guukovmsjsnlpphgujcv task_description invalid \n", "32181 PHID-TASK-guukovmsjsnlpphgujcv task_subcomment NaN \n", "\n", " text \\\n", "708 Tested on both the Italian and the English Wik... \n", "709 Note that this is fixed and has been deployed ... \n", "712 *** Bug 55362 has been marked as a duplicate o... \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... \n", "718 Krinkle, do I need to file a different bug for... \n", "... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n", "32180 Test case:\\n\\n+ 'removin... \n", "32181 With bug 45061 all change marker code has been... \n", "\n", " resolved_text meta.gerrit id \\\n", "708 Tested on both the Italian and the English Wik... False 709 \n", "709 Note that this is fixed and has been deployed ... False 710 \n", "712 *** Bug 55362 has been marked as a duplicate o... False 713 \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... False 718 \n", "718 Krinkle, do Krinkle need to file a different b... False 719 \n", "... ... ... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... False 32173 \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... False 32179 \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. False 32180 \n", "32180 Test case:\\n\\n+ 'removin... False 32181 \n", "32181 With bug 45061 all change marker code has been... False 32182 \n", "\n", " reply_to first_comment timestamp \n", "708 NaN False 2013-10-05 12:42:00+00:00 \n", "709 709.0 False 2013-10-09 01:10:33+00:00 \n", "712 712.0 False 2013-10-08 21:24:11+00:00 \n", "717 717.0 False 2013-10-07 17:47:04+00:00 \n", "718 718.0 False 2013-10-07 10:48:42+00:00 \n", "... ... ... ... \n", "32172 NaN False 2012-12-05 20:16:00+00:00 \n", "32178 32178.0 False 2013-02-07 03:03:48+00:00 \n", "32179 32179.0 False 2012-12-08 00:35:21+00:00 \n", "32180 NaN False 2012-11-17 06:42:00+00:00 \n", "32181 32181.0 False 2013-02-16 00:44:33+00:00 \n", "\n", "[8804 rows x 15 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "comment_phab_df" ] }, { "cell_type": "code", "execution_count": 17, "id": "d226d781-b002-4842-a3ae-92d4851a5878", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def preprocess_text(text):\n", " text = str(text)\n", " text = text.replace('*', ' ')\n", " text = text.replace('-', ' ')\n", " text = re.sub(r'http\\S+', '', text)\n", " return text" ] }, { "cell_type": "code", "execution_count": 18, "id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_49967/3649688126.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)\n" ] } ], "source": [ "comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)" ] }, { "cell_type": "code", "execution_count": 19, "id": "b8eddf40-1fe2-4fce-be74-b32552b40c57", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_49967/1316816771.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)\n" ] } ], "source": [ "comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)" ] }, { "cell_type": "code", "execution_count": 20, "id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "def extract_dependency_tree(sentence):\n", " doc = nlp(sentence)\n", " return [(token.text, token.lemma_, token.dep_, token.head.text, token.ancestors, token.subtree, token.children) for token in doc]" ] }, { "cell_type": "code", "execution_count": 21, "id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_49967/2805711855.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)\n" ] } ], "source": [ "comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)" ] }, { "cell_type": "code", "execution_count": 22, "id": "337a528a-5667-4e1f-ac9a-37caabc03a18", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_49967/2117289791.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)\n" ] } ], "source": [ "comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)" ] }, { "cell_type": "code", "execution_count": 32, "id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4", "metadata": {}, "outputs": [], "source": [ "#get VAD scores\n", "import numpy as np\n", "#https://saifmohammad.com/WebPages/nrc-vad.html\n", "column_headings = ['Word', 'Valence', 'Arousal', 'Domination']\n", "vad_lexicon = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\\t', header=None, names=column_headings)\n", "vad_dict = vad_lexicon.set_index('Word').T.to_dict()\n", "\n", "def vad_scoring(dependency_tree):\n", " valence = []\n", " arousal = []\n", " dominance = []\n", " for token, lemma, dep, head, ancestors, subtree, children in dependency_tree:\n", " if lemma in vad_dict:\n", " valence.append(vad_dict[lemma]['Valence'])\n", " arousal.append(vad_dict[lemma]['Arousal'])\n", " dominance.append(vad_dict[lemma]['Domination'])\n", "\n", " # Compute average scores across the comment\n", " avg_valence = np.mean(valence) if valence else 0\n", " avg_arousal = np.mean(arousal) if arousal else 0\n", " avg_dominance = np.mean(dominance) if dominance else 0\n", "\n", " return [avg_valence, avg_arousal, avg_dominance]\n", "\n", "def dominance_prevail(dependency_tree):\n", " dominant_words = 0 \n", " for token, lemma, dep, head, ancestors, subtree, children in dependency_tree:\n", " if lemma in vad_dict:\n", " if vad_dict[lemma]['Domination'] >= 0.75:\n", " dominant_words += 1\n", " return dominant_words\n", " " ] }, { "cell_type": "code", "execution_count": 33, "id": "828fb57a-e152-42ef-9c60-660648898532", "metadata": {}, "outputs": [], "source": [ "#establishing per-comment VAD scores \n", "comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n", "comment_phab_df['dominant_wc'] = comment_phab_df['dependency_tree'].apply(dominance_prevail)" ] }, { "cell_type": "code", "execution_count": 34, "id": "27e47f6f-0257-4b70-b222-e91ef888c900", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
task_titlecomment_textdate_createdspeakermeta.affilconversation_idcomment_typestatustextresolved_text...first_commenttimestampprocessed_textprocessed_resolved_textdependency_treeresolved_dependency_treeaverage_v_scoreaverage_a_scoreaverage_d_scoredominant_wc
708VisualEditor: [Regression] \"More\" menu gets sh...Tested on both the Italian and the English Wik...1380976920PHID-USER-wil4b5lylrvf3krixlklTruePHID-TASK-64s56xzrc22ustp2z7wxtask_descriptionresolvedTested on both the Italian and the English Wik...Tested on both the Italian and the English Wik......False2013-10-05 12:42:00+00:00Tested on both the Italian and the English Wik...Tested on both the Italian and the English Wik...[(Tested, test, advcl, Reach, <generator objec...[(Tested, test, advcl, Reach, <generator objec...0.5753040.3979130.4759132
709VisualEditor: [Regression] \"More\" menu gets sh...Note that this is fixed and has been deployed ...1381281033PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaNNote that this is fixed and has been deployed ...Note that this is fixed and has been deployed ......False2013-10-09 01:10:33+00:00Note that this is fixed and has been deployed ...Note that this is fixed and has been deployed ...[(Note, note, ROOT, Note, <generator object at...[(Note, note, ROOT, Note, <generator object at...0.6231000.4229000.5435000
712VisualEditor: [Regression] \"More\" menu gets sh...*** Bug 55362 has been marked as a duplicate o...1381267451PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaN*** Bug 55362 has been marked as a duplicate o...*** Bug 55362 has been marked as a duplicate o......False2013-10-08 21:24:11+00:00Bug 55362 has been marked as a duplicate o...Bug 55362 has been marked as a duplicate o...[( , , dep, Bug, <generator object at 0...[( , , dep, Bug, <generator object at 0...0.5018330.3916670.4295000
717VisualEditor: [Regression] \"More\" menu gets sh...(In reply to comment #6)\\n> Krinkle, do I need...1381168024PHID-USER-sai77mtxmpqnm6pycyvzTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaN(In reply to comment #6)\\n> Krinkle, do I need...(In reply to comment #6)\\n> Krinkle, do I need......False2013-10-07 17:47:04+00:00(In reply to comment #6)\\n> Krinkle, do I need...(In reply to comment #6)\\n> Krinkle, do I need...[((, (, punct, comment, <generator object at 0...[((, (, punct, comment, <generator object at 0...0.5694500.4056000.4376501
718VisualEditor: [Regression] \"More\" menu gets sh...Krinkle, do I need to file a different bug for...1381142922PHID-USER-wil4b5lylrvf3krixlklTruePHID-TASK-64s56xzrc22ustp2z7wxtask_subcommentNaNKrinkle, do I need to file a different bug for...Krinkle, do Krinkle need to file a different b......False2013-10-07 10:48:42+00:00Krinkle, do I need to file a different bug for...Krinkle, do Krinkle need to file a different b...[(Krinkle, Krinkle, npadvmod, need, <generator...[(Krinkle, Krinkle, npadvmod, need, <generator...0.6145560.4324440.4376671
..................................................................
32172Setup wikibugs and gerrit-wm for #mediawiki-vi...Puppet config for wikibugs:\\n\\nhttps://gerrit....1354738560PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_descriptionresolvedPuppet config for wikibugs:\\n\\nhttps://gerrit....Puppet config for wikibugs:\\n\\nhttps://gerrit.......False2012-12-05 20:16:00+00:00Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf...Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf...[(Puppet, puppet, compound, config, <generator...[(Puppet, puppet, compound, config, <generator...0.5253330.4293330.4013330
32178Setup wikibugs and gerrit-wm for #mediawiki-vi...gerrit-wm is done, but wikibugs is \"an almight...1360206228PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_subcommentNaNgerrit-wm is done, but wikibugs is \"an almight...gerrit-wm is done, but wikibugs is \"an almight......False2013-02-07 03:03:48+00:00gerrit wm is done, but wikibugs is \"an almight...gerrit wm is done, but wikibugs is \"an almight...[(gerrit, gerrit, compound, wm, <generator obj...[(gerrit, gerrit, compound, wm, <generator obj...0.5958180.5120910.5662733
32179Setup wikibugs and gerrit-wm for #mediawiki-vi...Attempted fixes in Gerrit 37566 and Gerrit 37570.1354926921PHID-USER-ydswvwhh5pm4lshahjjeTruePHID-TASK-ciosa56mnibqn4lx27ubtask_subcommentNaNAttempted fixes in Gerrit 37566 and Gerrit 37570.Attempted fixes in Gerrit 37566 and Gerrit 37570....False2012-12-08 00:35:21+00:00Attempted fixes in Gerrit 37566 and Gerrit 37570.Attempted fixes in Gerrit 37566 and Gerrit 37570.[(Attempted, attempt, amod, fixes, <generator ...[(Attempted, attempt, amod, fixes, <generator ...0.6925000.5145000.4750000
32180VisualEditor: Two replacements within the same...Test case:\\n\\n+ 'removin...1353134520PHID-USER-fovtl67ew4l4cc3oeypcFalsePHID-TASK-guukovmsjsnlpphgujcvtask_descriptioninvalidTest case:\\n\\n+ 'removin...Test case:\\n\\n+ 'removin......False2012-11-17 06:42:00+00:00Test case:\\n\\n+ 'removin...Test case:\\n\\n+ 'removin...[(Test, test, compound, case, <generator objec...[(Test, test, compound, case, <generator objec...0.5675090.4485610.5350534
32181VisualEditor: Two replacements within the same...With bug 45061 all change marker code has been...1360975473PHID-USER-it53o2f2kyryqyj33uztFalsePHID-TASK-guukovmsjsnlpphgujcvtask_subcommentNaNWith bug 45061 all change marker code has been...With bug 45061 all change marker code has been......False2013-02-16 00:44:33+00:00With bug 45061 all change marker code has been...With bug 45061 all change marker code has been...[(With, with, prep, change, <generator object ...[(With, with, prep, change, <generator object ...0.5304290.4120000.5095710
\n", "

8804 rows × 23 columns

\n", "
" ], "text/plain": [ " task_title \\\n", "708 VisualEditor: [Regression] \"More\" menu gets sh... \n", "709 VisualEditor: [Regression] \"More\" menu gets sh... \n", "712 VisualEditor: [Regression] \"More\" menu gets sh... \n", "717 VisualEditor: [Regression] \"More\" menu gets sh... \n", "718 VisualEditor: [Regression] \"More\" menu gets sh... \n", "... ... \n", "32172 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32178 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32179 Setup wikibugs and gerrit-wm for #mediawiki-vi... \n", "32180 VisualEditor: Two replacements within the same... \n", "32181 VisualEditor: Two replacements within the same... \n", "\n", " comment_text date_created \\\n", "708 Tested on both the Italian and the English Wik... 1380976920 \n", "709 Note that this is fixed and has been deployed ... 1381281033 \n", "712 *** Bug 55362 has been marked as a duplicate o... 1381267451 \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... 1381168024 \n", "718 Krinkle, do I need to file a different bug for... 1381142922 \n", "... ... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... 1354738560 \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... 1360206228 \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. 1354926921 \n", "32180 Test case:\\n\\n+ 'removin... 1353134520 \n", "32181 With bug 45061 all change marker code has been... 1360975473 \n", "\n", " speaker meta.affil \\\n", "708 PHID-USER-wil4b5lylrvf3krixlkl True \n", "709 PHID-USER-ydswvwhh5pm4lshahjje True \n", "712 PHID-USER-ydswvwhh5pm4lshahjje True \n", "717 PHID-USER-sai77mtxmpqnm6pycyvz True \n", "718 PHID-USER-wil4b5lylrvf3krixlkl True \n", "... ... ... \n", "32172 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32178 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32179 PHID-USER-ydswvwhh5pm4lshahjje True \n", "32180 PHID-USER-fovtl67ew4l4cc3oeypc False \n", "32181 PHID-USER-it53o2f2kyryqyj33uzt False \n", "\n", " conversation_id comment_type status \\\n", "708 PHID-TASK-64s56xzrc22ustp2z7wx task_description resolved \n", "709 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "712 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "717 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "718 PHID-TASK-64s56xzrc22ustp2z7wx task_subcomment NaN \n", "... ... ... ... \n", "32172 PHID-TASK-ciosa56mnibqn4lx27ub task_description resolved \n", "32178 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n", "32179 PHID-TASK-ciosa56mnibqn4lx27ub task_subcomment NaN \n", "32180 PHID-TASK-guukovmsjsnlpphgujcv task_description invalid \n", "32181 PHID-TASK-guukovmsjsnlpphgujcv task_subcomment NaN \n", "\n", " text \\\n", "708 Tested on both the Italian and the English Wik... \n", "709 Note that this is fixed and has been deployed ... \n", "712 *** Bug 55362 has been marked as a duplicate o... \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... \n", "718 Krinkle, do I need to file a different bug for... \n", "... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n", "32180 Test case:\\n\\n+ 'removin... \n", "32181 With bug 45061 all change marker code has been... \n", "\n", " resolved_text ... first_comment \\\n", "708 Tested on both the Italian and the English Wik... ... False \n", "709 Note that this is fixed and has been deployed ... ... False \n", "712 *** Bug 55362 has been marked as a duplicate o... ... False \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... ... False \n", "718 Krinkle, do Krinkle need to file a different b... ... False \n", "... ... ... ... \n", "32172 Puppet config for wikibugs:\\n\\nhttps://gerrit.... ... False \n", "32178 gerrit-wm is done, but wikibugs is \"an almight... ... False \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. ... False \n", "32180 Test case:\\n\\n+ 'removin... ... False \n", "32181 With bug 45061 all change marker code has been... ... False \n", "\n", " timestamp \\\n", "708 2013-10-05 12:42:00+00:00 \n", "709 2013-10-09 01:10:33+00:00 \n", "712 2013-10-08 21:24:11+00:00 \n", "717 2013-10-07 17:47:04+00:00 \n", "718 2013-10-07 10:48:42+00:00 \n", "... ... \n", "32172 2012-12-05 20:16:00+00:00 \n", "32178 2013-02-07 03:03:48+00:00 \n", "32179 2012-12-08 00:35:21+00:00 \n", "32180 2012-11-17 06:42:00+00:00 \n", "32181 2013-02-16 00:44:33+00:00 \n", "\n", " processed_text \\\n", "708 Tested on both the Italian and the English Wik... \n", "709 Note that this is fixed and has been deployed ... \n", "712 Bug 55362 has been marked as a duplicate o... \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... \n", "718 Krinkle, do I need to file a different bug for... \n", "... ... \n", "32172 Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... \n", "32178 gerrit wm is done, but wikibugs is \"an almight... \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n", "32180 Test case:\\n\\n+ 'removin... \n", "32181 With bug 45061 all change marker code has been... \n", "\n", " processed_resolved_text \\\n", "708 Tested on both the Italian and the English Wik... \n", "709 Note that this is fixed and has been deployed ... \n", "712 Bug 55362 has been marked as a duplicate o... \n", "717 (In reply to comment #6)\\n> Krinkle, do I need... \n", "718 Krinkle, do Krinkle need to file a different b... \n", "... ... \n", "32172 Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... \n", "32178 gerrit wm is done, but wikibugs is \"an almight... \n", "32179 Attempted fixes in Gerrit 37566 and Gerrit 37570. \n", "32180 Test case:\\n\\n+ 'removin... \n", "32181 With bug 45061 all change marker code has been... \n", "\n", " dependency_tree \\\n", "708 [(Tested, test, advcl, Reach, " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(10, 6))\n", "task_phab_df = phab_df[phab_df['comment_type']==\"task_description\"]\n", "task_phab_df['first_comment'] = task_phab_df.groupby('speaker')['timestamp'].rank(method='first') == 1\n", "task_phab_df = task_phab_df[(task_phab_df['date_created'] < 1383264000) & (task_phab_df['date_created'] > 1351728000)]\n", "\n", "task_phab_df['week'] = task_phab_df['timestamp'].dt.to_period('W').dt.start_time\n", "unique_taskPHIDs = task_phab_df.groupby('week')['conversation_id'].nunique()\n", "\n", "wmf_task_phab_df = task_phab_df[task_phab_df['meta.affil'] == True]\n", "wmf_tasks = wmf_task_phab_df.groupby('week')['conversation_id'].nunique()\n", "\n", "other_task_phab_df = task_phab_df[task_phab_df['meta.affil'] != True]\n", "other_tasks = other_task_phab_df.groupby('week')['conversation_id'].nunique()\n", "\n", "new_tasks_phab_df = task_phab_df[task_phab_df['first_comment'] == True]\n", "new_tasks = new_tasks_phab_df.groupby('week')['conversation_id'].nunique()\n", "\n", "sns.lineplot(x=unique_taskPHIDs.index, y=unique_taskPHIDs.values, color='black', label='Total', marker='o')\n", "sns.lineplot(x=wmf_tasks.index, y=wmf_tasks.values, color='#c7756a', label='WMF-affiliated authors', marker='o')\n", "sns.lineplot(x=other_tasks.index, y=other_tasks.values, color='#5da2d8', label='Nonaffiliated authors', marker='o')\n", "sns.lineplot(x=new_tasks.index, y=new_tasks.values, color=\"green\", label=\"first-timers\", marker='o')\n", "\n", "plt.title('New Phabricator Tasks Indexed with \"VisualEditor\"')\n", "plt.xlabel('Timestamp')\n", "plt.ylabel('Unique taskPHIDs')\n", "plt.xticks(rotation=45)\n", "plt.grid(True)\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": 50, "id": "b7cfad77-d48a-4708-91f3-89ae1179b90c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "comment_phab_df['before_after'] = comment_phab_df['timestamp'] > pd.Timestamp('2013-07-01 00:00:01+00:00')\n", "plt.figure(figsize=(10, 6))\n", "\n", "sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"dominant_wc\", hue=\"before_after\", col=\"meta.affil\", scatter=False)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 53, "id": "d2d67d38-f005-4c94-be3c-39eb6b22686f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_49967/3455565877.py:2: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True)]\n", "/tmp/ipykernel_49967/3455565877.py:3: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True)]\n", "/tmp/ipykernel_49967/3455565877.py:18: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", " filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "/tmp/ipykernel_49967/3455565877.py:18: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "/tmp/ipykernel_49967/3455565877.py:37: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", " resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "/tmp/ipykernel_49967/3455565877.py:37: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "/tmp/ipykernel_49967/3455565877.py:40: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", " resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n" ] }, { "ename": "IndexingError", "evalue": "Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexingError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[53], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m resolved_filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m resolved_filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mdt\u001b[38;5;241m.\u001b[39mto_period(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mW\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mdt\u001b[38;5;241m.\u001b[39mstart_time\n\u001b[1;32m 38\u001b[0m resolved_median_depth \u001b[38;5;241m=\u001b[39m resolved_filtered_dependencies\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdepth\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmedian()\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[0;32m---> 40\u001b[0m resolved_wmf_filtered_dependencies \u001b[38;5;241m=\u001b[39m \u001b[43mresolved_filtered_dependencies\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfiltered_dependencies\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwmfAffil\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m]\u001b[49m\n\u001b[1;32m 41\u001b[0m resolved_wmf_median_depth \u001b[38;5;241m=\u001b[39m resolved_wmf_filtered_dependencies\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweek\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdepth\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmedian()\u001b[38;5;241m.\u001b[39mreset_index()\n\u001b[1;32m 43\u001b[0m resolved_other_filtered_dependencies \u001b[38;5;241m=\u001b[39m resolved_filtered_dependencies[filtered_dependencies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwmfAffil\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m]\n", "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/frame.py:4093\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;66;03m# Do we have a (boolean) 1d indexer?\u001b[39;00m\n\u001b[1;32m 4092\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m com\u001b[38;5;241m.\u001b[39mis_bool_indexer(key):\n\u001b[0;32m-> 4093\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem_bool_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4095\u001b[0m \u001b[38;5;66;03m# We are left with two options: a single key, and a collection of keys,\u001b[39;00m\n\u001b[1;32m 4096\u001b[0m \u001b[38;5;66;03m# We interpret tuples as collections only for non-MultiIndex\u001b[39;00m\n\u001b[1;32m 4097\u001b[0m is_single_key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_list_like(key)\n", "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/frame.py:4149\u001b[0m, in \u001b[0;36mDataFrame._getitem_bool_array\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4143\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 4144\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mItem wrong length \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(key)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m instead of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4145\u001b[0m )\n\u001b[1;32m 4147\u001b[0m \u001b[38;5;66;03m# check_bool_indexer will throw exception if Series key cannot\u001b[39;00m\n\u001b[1;32m 4148\u001b[0m \u001b[38;5;66;03m# be reindexed to match DataFrame rows\u001b[39;00m\n\u001b[0;32m-> 4149\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_bool_indexer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4151\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mall():\n\u001b[1;32m 4152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcopy(deep\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n", "File \u001b[0;32m/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/pandas/core/indexing.py:2662\u001b[0m, in \u001b[0;36mcheck_bool_indexer\u001b[0;34m(index, key)\u001b[0m\n\u001b[1;32m 2660\u001b[0m indexer \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mget_indexer_for(index)\n\u001b[1;32m 2661\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01min\u001b[39;00m indexer:\n\u001b[0;32m-> 2662\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IndexingError(\n\u001b[1;32m 2663\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnalignable boolean Series provided as \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2664\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindexer (index of the boolean Series and of \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2665\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe indexed object do not match).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2666\u001b[0m )\n\u001b[1;32m 2668\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 2670\u001b[0m \u001b[38;5;66;03m# fall through for boolean\u001b[39;00m\n", "\u001b[0;31mIndexingError\u001b[0m: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match)." ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pattern = r'\\b(ve|VE|visualeditor|VisualEditor)\\b'\n", "filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True)]\n", "resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True)]\n", "\n", "plt.figure(figsize=(12, 8))\n", "gs = GridSpec(2, 1, height_ratios=[6, 6])\n", "\n", "# Main plot: Token depth by timestamp\n", "'''\n", "ax0 = plt.subplot(gs[0])\n", "sns.scatterplot(data=filtered_dependencies, x='timestamp', y='dependency', hue='wmfAffil', style='dependency', markers=True, s=100, ax=ax0)\n", "ax0.set_title('VE Depth by Timestamp w/o URLS')\n", "ax0.set_xlabel('')\n", "ax0.set_ylabel('Dependency Type')\n", "ax0.legend().set_visible(False)\n", "'''\n", "# Calculate the median depth over time\n", "filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "median_depth = filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "wmf_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n", "wmf_median_depth = wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "other_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] != True]\n", "other_median_depth = other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "# Plot the median depth over time\n", "ax0 = plt.subplot(gs[0])\n", "sns.lineplot(data=median_depth, x='week', y='depth', ax=ax0, color='black', label='Median Depth', marker='o')\n", "sns.lineplot(data=wmf_median_depth, x='week', y='depth', ax=ax0, color='#c7756a', label='WMF-affiliated authors', marker='x')\n", "sns.lineplot(data=other_median_depth, x='week', y='depth', ax=ax0, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n", "ax0.set_title('Median Depth of VE in Phabricator Sentence Dependency Trees')\n", "ax0.set_ylabel('Median Depth')\n", "ax0.set_xlabel('')\n", "\n", "# Calculate the median depth over time\n", "resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n", "resolved_median_depth = resolved_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n", "resolved_wmf_median_depth = resolved_wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "resolved_other_filtered_dependencies = resolved_filtered_dependencies[filtered_dependencies['wmfAffil'] != True]\n", "resolved_other_median_depth = resolved_other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n", "\n", "# Plot the median depth over time\n", "ax1 = plt.subplot(gs[1])\n", "sns.lineplot(data=resolved_median_depth, x='week', y='depth', ax=ax1, color='black', label='Median Depth', marker='o')\n", "sns.lineplot(data=resolved_wmf_median_depth, x='week', y='depth', ax=ax1, color='#c7756a', label='WMF-affiliated authors', marker='x')\n", "sns.lineplot(data=resolved_other_median_depth, x='week', y='depth', ax=ax1, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n", "ax1.set_title('Median Depth of VE in Coreference-resolved Phabricator Sentence Dependency Trees')\n", "ax1.set_ylabel('Median Depth')\n", "ax1.set_xlabel('')\n", "\n", "plt.tight_layout()\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 5 }