{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ba9e5acd-e17d-4318-9272-04c9f6706186", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import spacy" ] }, { "cell_type": "code", "execution_count": 50, "id": "e4f0b3f0-5255-46f1-822f-e455087ba315", "metadata": {}, "outputs": [], "source": [ "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n", "phab_df = pd.read_csv(phab_path)" ] }, { "cell_type": "code", "execution_count": 51, "id": "d449164e-1d28-4580-9eb1-f0f69978f114", "metadata": {}, "outputs": [], "source": [ "#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n", "phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n", "#cleaning df\n", "phab_df['id'] = phab_df.index + 1\n", "#may have to build out the reply_to column \n", "phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n", "phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n", "\n", "phab_df = phab_df.rename(columns={\n", " 'comment_text': 'text',\n", " 'AuthorPHID': 'speaker',\n", " 'TaskPHID': 'conversation_id',\n", " 'WMFaffil':'meta.affil',\n", " 'isGerrit': 'meta.gerrit'\n", "})\n", "\n", "#look for set of prior commenters and then add onto new set each time, just new or not\n", "\n", "# after 11-1-2012 before 11-1-2013\n", "phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n", "filtered_phab_df = phab_df[(phab_df['date_created'] < 1383264000) & (phab_df['date_created'] > 1351728000)]\n", "\n", "#removing headless conversations\n", "task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n", "headed_task_phids = task_phab_df['conversation_id'].unique()\n", "filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n", "\n", "#removing gerrit comments \n", "comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]" ] }, { "cell_type": "code", "execution_count": 52, "id": "942344db-c8f5-4ed6-a757-c97f8454f18b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique conversation_ids: 2081\n", "Unique ids: 8804\n", "Unique speakers: 230\n" ] } ], "source": [ "unique_conversation_ids = len(comment_phab_df['conversation_id'].unique())\n", "unique_ids = len(comment_phab_df['id'].unique())\n", "unique_speakers = len(comment_phab_df['speaker'].unique())\n", "\n", "print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n", "print(f\"Unique ids: {unique_ids}\")\n", "print(f\"Unique speakers: {unique_speakers}\")" ] }, { "cell_type": "code", "execution_count": 53, "id": "d226d781-b002-4842-a3ae-92d4851a5878", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def preprocess_text(text):\n", " text = str(text)\n", " text = text.replace('*', ' ')\n", " text = text.replace('-', ' ')\n", " text = re.sub(r'http\\S+', '', text)\n", " return text" ] }, { "cell_type": "code", "execution_count": 54, "id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_82094/3649688126.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)\n" ] } ], "source": [ "comment_phab_df['processed_text'] = comment_phab_df['text'].apply(preprocess_text)" ] }, { "cell_type": "code", "execution_count": null, "id": "b8eddf40-1fe2-4fce-be74-b32552b40c57", "metadata": {}, "outputs": [], "source": [ "# TODO: add coreference resolution here, possibly... " ] }, { "cell_type": "code", "execution_count": 55, "id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "def extract_dependency_tree(sentence):\n", " doc = nlp(sentence)\n", " return [(token.text, token.lemma_, token.dep_, token.head.text, token.ancestors, token.subtree, token.children) for token in doc]" ] }, { "cell_type": "code", "execution_count": 56, "id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_82094/2805711855.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)\n" ] } ], "source": [ "comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)" ] }, { "cell_type": "code", "execution_count": 57, "id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4", "metadata": {}, "outputs": [], "source": [ "#get VAD scores\n", "import numpy as np\n", "#https://saifmohammad.com/WebPages/nrc-vad.html\n", "column_headings = ['Word', 'Valence', 'Arousal', 'Domination']\n", "vad_lexicon = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\\t', header=None, names=column_headings)\n", "vad_dict = vad_lexicon.set_index('Word').T.to_dict()\n", "\n", "def vad_scoring(dependency_tree):\n", " valence = []\n", " arousal = []\n", " dominance = []\n", " for token, lemma, dep, head, ancestors, subtree, children in dependency_tree:\n", " if lemma in vad_dict:\n", " valence.append(vad_dict[lemma]['Valence'])\n", " arousal.append(vad_dict[lemma]['Arousal'])\n", " dominance.append(vad_dict[lemma]['Domination'])\n", "\n", " # Compute average scores across the comment\n", " avg_valence = np.mean(valence) if valence else 0\n", " avg_arousal = np.mean(arousal) if arousal else 0\n", " avg_dominance = np.mean(dominance) if dominance else 0\n", "\n", " return [avg_valence, avg_arousal, avg_dominance]" ] }, { "cell_type": "code", "execution_count": 58, "id": "828fb57a-e152-42ef-9c60-660648898532", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_82094/4260833843.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n" ] } ], "source": [ "#establishing per-comment VAD scores \n", "comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)" ] }, { "cell_type": "code", "execution_count": 68, "id": "27e47f6f-0257-4b70-b222-e91ef888c900", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_82094/3688984161.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n" ] }, { "data": { "text/html": [ "
\n", " | task_title | \n", "text | \n", "date_created | \n", "speaker | \n", "meta.affil | \n", "conversation_id | \n", "comment_type | \n", "status | \n", "meta.gerrit | \n", "id | \n", "reply_to | \n", "timestamp | \n", "processed_text | \n", "dependency_tree | \n", "average_v_score | \n", "average_a_score | \n", "average_d_score | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
708 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Tested on both the Italian and the English Wik... | \n", "1380976920 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_description | \n", "resolved | \n", "False | \n", "709 | \n", "NaN | \n", "2013-10-05 12:42:00+00:00 | \n", "Tested on both the Italian and the English Wik... | \n", "[(Tested, test, advcl, Reach, <generator objec... | \n", "0.575304 | \n", "0.397913 | \n", "0.475913 | \n", "
709 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Note that this is fixed and has been deployed ... | \n", "1381281033 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "710 | \n", "709.0 | \n", "2013-10-09 01:10:33+00:00 | \n", "Note that this is fixed and has been deployed ... | \n", "[(Note, note, ROOT, Note, <generator object at... | \n", "0.623100 | \n", "0.422900 | \n", "0.543500 | \n", "
712 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "*** Bug 55362 has been marked as a duplicate o... | \n", "1381267451 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "713 | \n", "712.0 | \n", "2013-10-08 21:24:11+00:00 | \n", "Bug 55362 has been marked as a duplicate o... | \n", "[( , , dep, Bug, <generator object at 0... | \n", "0.501833 | \n", "0.391667 | \n", "0.429500 | \n", "
717 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "1381168024 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "718 | \n", "717.0 | \n", "2013-10-07 17:47:04+00:00 | \n", "(In reply to comment #6)\\n> Krinkle, do I need... | \n", "[((, (, punct, comment, <generator object at 0... | \n", "0.569450 | \n", "0.405600 | \n", "0.437650 | \n", "
718 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Krinkle, do I need to file a different bug for... | \n", "1381142922 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "719 | \n", "718.0 | \n", "2013-10-07 10:48:42+00:00 | \n", "Krinkle, do I need to file a different bug for... | \n", "[(Krinkle, Krinkle, npadvmod, need, <generator... | \n", "0.614556 | \n", "0.432444 | \n", "0.437667 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
32172 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Puppet config for wikibugs:\\n\\nhttps://gerrit.... | \n", "1354738560 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_description | \n", "resolved | \n", "False | \n", "32173 | \n", "NaN | \n", "2012-12-05 20:16:00+00:00 | \n", "Puppet config for wikibugs:\\n\\n\\n\\nPuppet conf... | \n", "[(Puppet, puppet, compound, config, <generator... | \n", "0.525333 | \n", "0.429333 | \n", "0.401333 | \n", "
32178 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "gerrit-wm is done, but wikibugs is \"an almight... | \n", "1360206228 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "32179 | \n", "32178.0 | \n", "2013-02-07 03:03:48+00:00 | \n", "gerrit wm is done, but wikibugs is \"an almight... | \n", "[(gerrit, gerrit, compound, wm, <generator obj... | \n", "0.595818 | \n", "0.512091 | \n", "0.566273 | \n", "
32179 | \n", "Setup wikibugs and gerrit-wm for #mediawiki-vi... | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "1354926921 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-ciosa56mnibqn4lx27ub | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "32180 | \n", "32179.0 | \n", "2012-12-08 00:35:21+00:00 | \n", "Attempted fixes in Gerrit 37566 and Gerrit 37570. | \n", "[(Attempted, attempt, amod, fixes, <generator ... | \n", "0.692500 | \n", "0.514500 | \n", "0.475000 | \n", "
32180 | \n", "VisualEditor: Two replacements within the same... | \n", "Test case:\\n\\n+ 'removin... | \n", "1353134520 | \n", "PHID-USER-fovtl67ew4l4cc3oeypc | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_description | \n", "invalid | \n", "False | \n", "32181 | \n", "NaN | \n", "2012-11-17 06:42:00+00:00 | \n", "Test case:\\n\\n+ 'removin... | \n", "[(Test, test, compound, case, <generator objec... | \n", "0.567509 | \n", "0.448561 | \n", "0.535053 | \n", "
32181 | \n", "VisualEditor: Two replacements within the same... | \n", "With bug 45061 all change marker code has been... | \n", "1360975473 | \n", "PHID-USER-it53o2f2kyryqyj33uzt | \n", "False | \n", "PHID-TASK-guukovmsjsnlpphgujcv | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "32182 | \n", "32181.0 | \n", "2013-02-16 00:44:33+00:00 | \n", "With bug 45061 all change marker code has been... | \n", "[(With, with, prep, change, <generator object ... | \n", "0.530429 | \n", "0.412000 | \n", "0.509571 | \n", "
8804 rows × 17 columns
\n", "