{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ba9e5acd-e17d-4318-9272-04c9f6706186", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import spacy" ] }, { "cell_type": "code", "execution_count": 2, "id": "e4f0b3f0-5255-46f1-822f-e455087ba315", "metadata": {}, "outputs": [], "source": [ "# Phabricator comment/task export with coreference-resolved text\n", "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref-rel-final.csv\"\n", "phab_df = pd.read_csv(phab_path)" ] }, { "cell_type": "code", "execution_count": 3, "id": "d449164e-1d28-4580-9eb1-f0f69978f114", "metadata": {}, "outputs": [], "source": [ "# NOTE: earlier passes experimented with extra filters (gerrit-bot authorship,\n", "# date windows, http/SSL relevance, sourceforge-migration removal); they were\n", "# already disabled, so the dead commented-out code has been dropped here.\n", "\n", "# Keep only \"headed\" conversations: tasks whose opening task_description\n", "# comment is present in the export.\n", "task_phab_df = phab_df[phab_df['comment_type']==\"task_description\"]\n", "headed_task_phids = task_phab_df['conversation_id'].unique()\n", "filtered_phab_df = phab_df[phab_df['conversation_id'].isin(headed_task_phids)]\n", "\n", "# .copy() so the later column assignments on comment_phab_df do not write\n", "# into a view of phab_df (SettingWithCopyWarning / silently lost writes)\n", "comment_phab_df = filtered_phab_df.copy()" ] }, { "cell_type": "code", "execution_count": 4, "id": "942344db-c8f5-4ed6-a757-c97f8454f18b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique conversation_ids: 2281\n", "Unique ids: 14490\n", "Unique speakers: 634\n" ] } ], "source": [ "# sanity check of corpus size after filtering\n", "unique_conversation_ids = comment_phab_df['conversation_id'].nunique()\n", "unique_ids = comment_phab_df['id'].nunique()\n", "unique_speakers = comment_phab_df['speaker'].nunique()\n", "\n", "print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n", "print(f\"Unique ids: {unique_ids}\")\n", "print(f\"Unique speakers: {unique_speakers}\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "d226d781-b002-4842-a3ae-92d4851a5878", "metadata": 
{}, "outputs": [], "source": [ "import re\n", "\n", "def preprocess_text(text):\n", "    \"\"\"Normalize comment text for parsing: blank out markup bullets/dashes,\n", "    then strip URLs.\"\"\"\n", "    cleaned = str(text)\n", "    # order matters: markup characters are blanked before the URL regex runs\n", "    cleaned = cleaned.replace('*', ' ').replace('-', ' ')\n", "    cleaned = re.sub(r'http\\S+', '', cleaned)\n", "    return cleaned" ] }, { "cell_type": "code", "execution_count": 6, "id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559", "metadata": {}, "outputs": [], "source": [ "comment_phab_df['processed_text'] = comment_phab_df['comment_text'].apply(preprocess_text)" ] }, { "cell_type": "code", "execution_count": 7, "id": "b8eddf40-1fe2-4fce-be74-b32552b40c57", "metadata": {}, "outputs": [], "source": [ "comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)" ] }, { "cell_type": "code", "execution_count": 8, "id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "def extract_dependency_tree(text):\n", "    \"\"\"Parse `text` with spaCy and flatten every token into a tuple of\n", "    (text, lemma, dep label, head text, ancestors, subtree, children).\"\"\"\n", "    parsed = nlp(text)\n", "    return [\n", "        (\n", "            token.text,\n", "            token.lemma_,\n", "            token.dep_,\n", "            token.head.text,\n", "            list(token.ancestors),\n", "            list(token.subtree),\n", "            list(token.children)\n", "        )\n", "        for sentence in parsed.sents\n", "        for token in sentence\n", "    ]" ] }, { "cell_type": "code", "execution_count": 9, "id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920", "metadata": {}, "outputs": [], "source": [ "comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)" ] }, { "cell_type": "code", "execution_count": 10, "id": "337a528a-5667-4e1f-ac9a-37caabc03a18", "metadata": {}, "outputs": [], "source": [ "comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)" ] }, { "cell_type": "code", "execution_count": 11, "id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4", "metadata": {}, "outputs": [], "source": [ "#get VAD scores\n", 
"import numpy as np\n", "# NRC-VAD lexicon: https://saifmohammad.com/WebPages/nrc-vad.html\n", "# NB: the third VAD dimension is normally called Dominance; the 'Domination'\n", "# label is kept because downstream cells key on that name.\n", "column_headings = ['Word', 'Valence', 'Arousal', 'Domination']\n", "vad_lexicon = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\\t', header=None, names=column_headings)\n", "vad_dict = vad_lexicon.set_index('Word').T.to_dict()\n", "\n", "def vad_scoring(dependency_tree):\n", "    \"\"\"Return [mean valence, mean arousal, mean dominance] over the lemmas\n", "    found in the lexicon; a dimension defaults to 0 when nothing matches.\"\"\"\n", "    valence, arousal, dominance = [], [], []\n", "    for token, lemma, dep, head, ancestors, subtree, children in dependency_tree:\n", "        if lemma in vad_dict:\n", "            valence.append(vad_dict[lemma]['Valence'])\n", "            arousal.append(vad_dict[lemma]['Arousal'])\n", "            dominance.append(vad_dict[lemma]['Domination'])\n", "    avg_valence = np.mean(valence) if valence else 0\n", "    avg_arousal = np.mean(arousal) if arousal else 0\n", "    avg_dominance = np.mean(dominance) if dominance else 0\n", "    return [avg_valence, avg_arousal, avg_dominance]\n", "\n", "def _polarized_count(dependency_tree, dimension, low=0.25, high=0.75):\n", "    \"\"\"Count lexicon lemmas whose `dimension` score is extreme (<= low or >= high).\n", "    Shared implementation of the three *_prevail counters below.\"\"\"\n", "    count = 0\n", "    for token, lemma, dep, head, ancestors, subtree, children in dependency_tree:\n", "        if lemma in vad_dict:\n", "            score = vad_dict[lemma][dimension]\n", "            if score >= high or score <= low:\n", "                count += 1\n", "    return count\n", "\n", "def dominance_prevail(dependency_tree):\n", "    return _polarized_count(dependency_tree, 'Domination')\n", "\n", "def arousal_prevail(dependency_tree):\n", "    return _polarized_count(dependency_tree, 'Arousal')\n", "\n", "def valence_prevail(dependency_tree):\n", "    return _polarized_count(dependency_tree, 'Valence')" ] }, { "cell_type": "code", "execution_count": 12, "id": "828fb57a-e152-42ef-9c60-660648898532", "metadata": {}, "outputs": [], "source": [ "#establishing per-comment VAD scores\n", "comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n", "comment_phab_df['dominant_wc'] = comment_phab_df['dependency_tree'].apply(dominance_prevail)\n", "comment_phab_df['arousal_wc'] = comment_phab_df['dependency_tree'].apply(arousal_prevail)\n", "comment_phab_df['valence_wc'] = comment_phab_df['dependency_tree'].apply(valence_prevail)" ] }, { "cell_type": "code", "execution_count": 13, "id": "27e47f6f-0257-4b70-b222-e91ef888c900", "metadata": {}, "outputs": [], "source": [ "# split the 3-element VAD list into one numeric column per dimension\n", "comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n", "comment_phab_df = comment_phab_df.drop(columns=['avg_vad_scores'])" ] }, { "cell_type": "code", "execution_count": 14, "id": "1889034d-bc93-495f-bdc4-961d193d3e08", "metadata": {}, "outputs": [], "source": [ "def token_http_relevant(word):\n", "    \"\"\"True when `word` hints at HTTP/auth/TLS topics (http, login, ssl, tls,\n", "    cert*) but is not itself a URL (contains '://').\"\"\"\n", "    w = word.lower()\n", "    if \"://\" not in w:\n", "        if \"http\" in w:\n", "            return True\n", "        if \"login\" in w:\n", "            return True\n", "        if \"ssl\" in w:\n", "            return True\n", "        if \"tls\" in w:\n", "            return True\n", "        # 'cert', 'certificate', ... but not 'certain(ly)'\n", "        if w.startswith(\"cert\") and not w.startswith(\"certain\"):\n", "            return True\n", "    return False" ] }, { "cell_type": "code", "execution_count": 15, "id": "09ddcbfc-b856-40ca-ad61-13577795d94b", "metadata": {}, "outputs": [], "source": [ "import datetime" ] }, { "cell_type": "code", "execution_count": 16, "id": "184ccbe6-0a7a-41b8-9b02-bc439ff975d0", "metadata": {}, "outputs": [], "source": [ "# expand the dependency parser output into one flat record per token\n", "dependency_relations = []\n", "resolved_dependency_relations = []\n", "\n", "def _token_records(tree, comment_id, timestamp, wmf_affil):\n", "    \"\"\"Flatten one dependency tree into per-token relation records.\"\"\"\n", "    records = []\n", "    for token, lemma, dep, head, ancestors, subtree, children in tree:\n", "        records.append({\n", "            'comment_id': comment_id,\n", "            'timestamp': timestamp,\n", "            'wmfAffil': wmf_affil,\n", "            'token': token,\n", "            'dependency': dep,\n", "            'head': head,\n", "            'depth': len(list(ancestors)),\n", "            'children': len(list(children))\n", "        })\n", "    return records\n", "\n", "for index, row in comment_phab_df.iterrows():\n", "    dependency_relations.extend(\n", "        _token_records(row['dependency_tree'], row['id'], row['timestamp'], row['meta.affil']))\n", "    resolved_dependency_relations.extend(\n", "        _token_records(row['resolved_dependency_tree'], row['id'], row['timestamp'], row['meta.affil']))\n", "\n", "resolved_dependency_relations_df = pd.DataFrame(resolved_dependency_relations)\n", "dependency_relations_df = pd.DataFrame(dependency_relations)" ] }, { "cell_type": "code", "execution_count": 17, "id": "82498686-14f4-40c8-9e33-27b31f115b47", "metadata": {}, "outputs": [], "source": [ "#now analysis/plotting\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from matplotlib.gridspec import GridSpec" ] }, { "cell_type": "code", "execution_count": null, "id": "5a91a59a-0d1c-48b3-93dd-b9df76ca68e5", "metadata": {}, "outputs": [], "source": [ "# FIXME: this cell referenced `affective_comment_phab_df`, `palette`, and\n", "# columns (speakers_comment, polarized_wc, date_group) that are never defined\n", "# in this notebook, so it raised NameError when run. Disabled until those\n", "# inputs are rebuilt upstream.\n", "#plot2 = sns.lmplot(data=affective_comment_phab_df, x=\"speakers_comment\", y=\"polarized_wc\", hue=\"date_group\", col=\"meta.affil\", scatter=False, legend=False, palette=palette)\n", "#plot2.set_axis_labels(\"Index of Speaker's Comment\", \"Count of Polarized Words\")\n", "#plot2.set_titles(col_template=\"WMF Affiliation: {col_name}\")\n", "#plot2.fig.subplots_adjust(top=0.9)\n", "#plot2.add_legend(title=\"Comment publication timestamp:\")" ] }, { "cell_type": "code", "execution_count": 19, "id": "2274795e-c64d-43e4-b0f5-a19b5b8ba2c8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | comment_id | \n", "timestamp | \n", "wmfAffil | \n", "token | \n", "dependency | \n", "head | \n", "depth | \n", "children | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "243215 | \n", "2014-09-24 06:48:00+00:00 | \n", "False | \n", "User | \n", "dep | \n", "reported | \n", "1 | \n", "0 | \n", "
1 | \n", "243215 | \n", "2014-09-24 06:48:00+00:00 | \n", "False | \n", ": | \n", "punct | \n", "reported | \n", "1 | \n", "0 | \n", "
2 | \n", "243215 | \n", "2014-09-24 06:48:00+00:00 | \n", "False | \n", "NickK | \n", "nsubj | \n", "reported | \n", "1 | \n", "0 | \n", "
3 | \n", "243215 | \n", "2014-09-24 06:48:00+00:00 | \n", "False | \n", "reported | \n", "ROOT | \n", "reported | \n", "0 | \n", "7 | \n", "
4 | \n", "243215 | \n", "2014-09-24 06:48:00+00:00 | \n", "False | \n", "in | \n", "prep | \n", "reported | \n", "1 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1708744 | \n", "406877 | \n", "2014-10-17 09:17:17+00:00 | \n", "False | \n", "n't | \n", "neg | \n", "do | \n", "1 | \n", "0 | \n", "
1708745 | \n", "406877 | \n", "2014-10-17 09:17:17+00:00 | \n", "False | \n", "really | \n", "advmod | \n", "do | \n", "1 | \n", "0 | \n", "
1708746 | \n", "406877 | \n", "2014-10-17 09:17:17+00:00 | \n", "False | \n", "do | \n", "ROOT | \n", "do | \n", "0 | \n", "9 | \n", "
1708747 | \n", "406877 | \n", "2014-10-17 09:17:17+00:00 | \n", "False | \n", "much | \n", "dobj | \n", "do | \n", "1 | \n", "0 | \n", "
1708748 | \n", "406877 | \n", "2014-10-17 09:17:17+00:00 | \n", "False | \n", ". | \n", "punct | \n", "do | \n", "1 | \n", "0 | \n", "
1708749 rows × 8 columns
\n", "