990 lines
299 KiB
Plaintext
990 lines
299 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "ba9e5acd-e17d-4318-9272-04c9f6706186",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd \n",
|
|
"import spacy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "e4f0b3f0-5255-46f1-822f-e455087ba315",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Absolute path of the case-3 Phabricator comment export (CSV).
phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv"
# Load the export into a DataFrame with pandas' default parsing options.
phab_df = pd.read_csv(filepath_or_buffer=phab_path)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "ac5e624b-08a4-4ede-bc96-cfc26c3edac3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def http_relevant(text):
    """Return True when ``text`` mentions an HTTP/security keyword outside a URL.

    Null input (as judged by ``pd.isnull``) is never relevant. The text is
    split on whitespace and each token is lower-cased; tokens containing
    "://" are skipped. A remaining token counts when it contains "http",
    "login", "ssl" or "tls", or when it starts with "cert" but not with
    "certain".

    NOTE(review): the notebook's indentation was lost in this copy; this
    assumes every keyword check sits under the '"://" not in word' guard --
    confirm against the original notebook.
    """
    if pd.isnull(text):
        return False
    substring_keywords = ("http", "login", "ssl", "tls")
    for raw_word in text.split():
        token = raw_word.lower()
        if "://" in token:
            # URL-like token: none of the keyword checks apply to it.
            continue
        if any(keyword in token for keyword in substring_keywords):
            return True
        # "cert", "certs", "certificate", ... but not "certain"/"certainly".
        if token.startswith("cert") and not token.startswith("certain"):
            return True
    return False
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "d5925c49-ea1d-4813-98aa-eae10d5879ca",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
def is_migrated(comment_text):
    """Return True when the text begins with the SourceForge migration banner.

    Null input (per ``pd.isnull``) yields False. Surrounding whitespace is
    stripped before testing for the prefix
    "Originally from: http://sourceforge.net".
    """
    if pd.isnull(comment_text):
        return False
    return comment_text.strip().startswith("Originally from: http://sourceforge.net")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "d449164e-1d28-4580-9eb1-f0f69978f114",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/836739196.py:41: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
|
"/tmp/ipykernel_34086/836739196.py:44: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Build the analysis frames from the raw export:
#   task_phab_df    -- task descriptions that are relevant and not migrated
#   comment_phab_df -- non-Gerrit rows of those same conversations
#
# Study window on `date_created` (epoch seconds, both bounds exclusive):
# 2013-07-01 00:00:00 UTC < date_created < 2015-10-01 23:59:59 UTC.
WINDOW_START_EPOCH = 1372636800
WINDOW_END_EPOCH = 1443743999

#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb
phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'

#cleaning df
phab_df['id'] = phab_df.index + 1
#may have to build out the reply_to column
# reply_to = id of the previous row within the same task (missing for the first row).
phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)

# NOTE: this rename rebinds phab_df, so the cell cannot be re-run without
# reloading the CSV first (the original column names are gone afterwards).
phab_df = phab_df.rename(columns={
    'AuthorPHID': 'speaker',
    'TaskPHID': 'conversation_id',
    'WMFaffil': 'meta.affil',
    'isGerrit': 'meta.gerrit'
})

# after 07-01-2013 before 10-01-2015
phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
in_window = (phab_df['date_created'] < WINDOW_END_EPOCH) & (phab_df['date_created'] > WINDOW_START_EPOCH)
filtered_phab_df = phab_df[in_window]
#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]

#removing headless conversations
# .copy() makes the later column assignments write to an independent frame
# instead of a slice of phab_df (the source of the SettingWithCopyWarning).
task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == "task_description"].copy()
headed_task_phids = task_phab_df['conversation_id'].unique()
filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

#removing gerrit comments
mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()

# filter out the sourceforge migration
# Originally from: http://sourceforge.net in the task task_summary
migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()

#cut down to only the data that is relevant (mentions http)
relevant_conversation_ids = task_phab_df[
    task_phab_df['comment_text'].apply(http_relevant) |
    task_phab_df['task_title'].apply(http_relevant)
]['conversation_id'].unique()

task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)

task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)

# Keep relevant, non-migrated conversations; copy again because later cells
# add further columns to both frames.
comment_phab_df = mid_comment_phab_df[
    (mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)
].copy()
task_phab_df = task_phab_df[
    (task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)
].copy()
#comment_phab_df = mid_comment_phab_df
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Unique conversation_ids: 2281\n",
|
|
"Unique ids: 14490\n",
|
|
"Unique speakers: 634\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Size of the filtered comment set: distinct conversations, comment ids and
# speakers. nunique(dropna=False) counts a missing value as one distinct
# value, which matches len(series.unique()).
unique_conversation_ids = comment_phab_df['conversation_id'].nunique(dropna=False)
unique_ids = comment_phab_df['id'].nunique(dropna=False)
unique_speakers = comment_phab_df['speaker'].nunique(dropna=False)

print(f"Unique conversation_ids: {unique_conversation_ids}")
print(f"Unique ids: {unique_ids}")
print(f"Unique speakers: {unique_speakers}")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "d226d781-b002-4842-a3ae-92d4851a5878",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
def preprocess_text(text):
    """Normalise a comment before parsing.

    The value is coerced with ``str`` (so ``None`` becomes "None"), URLs of
    the form http://... or https://... are removed, and every '*' and '-'
    is replaced with a single space.

    URLs are stripped *before* the punctuation replacement: replacing '-'
    first splits hyphenated URLs and leaves their tail in the text. The URL
    pattern requires "://", so ordinary words such as "https" or "httpd"
    are kept rather than deleted along with links.
    """
    text = str(text)
    text = re.sub(r'https?://\S+', '', text)
    text = text.replace('*', ' ')
    text = text.replace('-', ' ')
    return text
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "3ae40d24-bbe8-49c3-a3a9-70bde1b4d559",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/2783900859.py:1: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['processed_text'] = comment_phab_df['comment_text'].apply(preprocess_text)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Add the cleaned comment text. .assign returns a new frame, so nothing is
# written into a slice of another DataFrame (no SettingWithCopyWarning).
comment_phab_df = comment_phab_df.assign(
    processed_text=comment_phab_df['comment_text'].apply(preprocess_text)
)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b8eddf40-1fe2-4fce-be74-b32552b40c57",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#comment_phab_df['processed_resolved_text'] = comment_phab_df['resolved_text'].apply(preprocess_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "a8469b16-4ae6-4b06-bf1b-1f2f6c736cab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
nlp = spacy.load("en_core_web_sm")


def extract_dependency_tree(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline and flatten it.

    Returns one tuple per token, in sentence order:
    (text, lemma, dependency label, head text, ancestors, subtree, children),
    where the last three items are lists built from the token's
    ``ancestors``, ``subtree`` and ``children`` iterators.
    """
    return [
        (
            tok.text,
            tok.lemma_,
            tok.dep_,
            tok.head.text,
            list(tok.ancestors),
            list(tok.subtree),
            list(tok.children),
        )
        for sent in nlp(text).sents
        for tok in sent
    ]
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "8b9a12f9-71bf-4bc9-bcfd-c73aab4be920",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/2805711855.py:1: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['dependency_tree'] = comment_phab_df['processed_text'].apply(extract_dependency_tree)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Parse every cleaned comment. .assign returns a new frame, so nothing is
# written into a slice of another DataFrame (no SettingWithCopyWarning).
comment_phab_df = comment_phab_df.assign(
    dependency_tree=comment_phab_df['processed_text'].apply(extract_dependency_tree)
)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "337a528a-5667-4e1f-ac9a-37caabc03a18",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#comment_phab_df['resolved_dependency_tree'] = comment_phab_df['processed_resolved_text'].apply(extract_dependency_tree)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "a3f5d40b-f56e-4e31-a7f9-40b7ddb4d2a4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#get VAD scores\n",
|
|
"import numpy as np\n",
|
|
"#https://saifmohammad.com/WebPages/nrc-vad.html\n",
|
# The lexicon is read as a tab-separated file with no header row; these are
# the column names given to it.
column_headings = ['Word', 'Valence', 'Arousal', 'Domination']
vad_lexicon = pd.read_csv(
    'NRC-VAD-Lexicon.txt',
    sep='\t',
    header=None,
    names=column_headings,
)
# Lookup table keyed by word: {word: {'Valence': ..., 'Arousal': ..., 'Domination': ...}}
vad_dict = vad_lexicon.set_index('Word').transpose().to_dict()
|
|
"\n",
|
def _lexicon_entries(dependency_tree, lexicon=None):
    """Return the lexicon entry of every token whose lemma is in the lexicon.

    ``dependency_tree`` is a sequence of 7-tuples
    (token, lemma, dep, head, ancestors, subtree, children); only the lemma
    is used. ``lexicon`` defaults to the module-level ``vad_dict``.
    """
    if lexicon is None:
        lexicon = vad_dict
    return [
        lexicon[lemma]
        for token, lemma, dep, head, ancestors, subtree, children in dependency_tree
        if lemma in lexicon
    ]


def _count_extreme_words(dependency_tree, dimension, lexicon=None, high=0.75, low=0.25):
    """Count lexicon-matched tokens whose ``dimension`` score is >= high or <= low."""
    count = 0
    for entry in _lexicon_entries(dependency_tree, lexicon):
        score = entry[dimension]
        # Two independent tests, as in the original pair of `if` statements.
        count += int(score >= high) + int(score <= low)
    return count


def vad_scoring(dependency_tree, lexicon=None):
    """Average Valence, Arousal and Domination over the lexicon-matched tokens.

    Returns ``[avg_valence, avg_arousal, avg_dominance]``; each average is 0
    when no token's lemma is found in the lexicon.
    """
    entries = _lexicon_entries(dependency_tree, lexicon)
    if not entries:
        return [0, 0, 0]
    return [
        np.mean([entry[dimension] for entry in entries])
        for dimension in ('Valence', 'Arousal', 'Domination')
    ]


def dominance_prevail(dependency_tree, lexicon=None):
    """Number of matched tokens with Domination >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Domination', lexicon)


def arousal_prevail(dependency_tree, lexicon=None):
    """Number of matched tokens with Arousal >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Arousal', lexicon)


def valence_prevail(dependency_tree, lexicon=None):
    """Number of matched tokens with Valence >= 0.75 or <= 0.25."""
    return _count_extreme_words(dependency_tree, 'Valence', lexicon)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "828fb57a-e152-42ef-9c60-660648898532",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/2858732056.py:2: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['avg_vad_scores'] = comment_phab_df['dependency_tree'].apply(vad_scoring)\n",
|
|
"/tmp/ipykernel_34086/2858732056.py:3: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['dominant_wc'] = comment_phab_df['dependency_tree'].apply(dominance_prevail)\n",
|
|
"/tmp/ipykernel_34086/2858732056.py:4: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['arousal_wc'] = comment_phab_df['dependency_tree'].apply(arousal_prevail)\n",
|
|
"/tmp/ipykernel_34086/2858732056.py:5: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df['valence_wc'] = comment_phab_df['dependency_tree'].apply(valence_prevail)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
#establishing per-comment VAD scores
# One .assign call builds all four columns on a new frame, so nothing is
# written into a slice of another DataFrame (no SettingWithCopyWarning).
comment_phab_df = comment_phab_df.assign(
    avg_vad_scores=comment_phab_df['dependency_tree'].apply(vad_scoring),
    dominant_wc=comment_phab_df['dependency_tree'].apply(dominance_prevail),
    arousal_wc=comment_phab_df['dependency_tree'].apply(arousal_prevail),
    valence_wc=comment_phab_df['dependency_tree'].apply(valence_prevail),
)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "27e47f6f-0257-4b70-b222-e91ef888c900",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/335308388.py:1: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n",
|
|
"/tmp/ipykernel_34086/335308388.py:1: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n",
|
|
"/tmp/ipykernel_34086/335308388.py:1: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" comment_phab_df[['average_v_score', 'average_a_score', 'average_d_score']] = pd.DataFrame(comment_phab_df['avg_vad_scores'].tolist(), index=comment_phab_df.index)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Split the [valence, arousal, dominance] list held in avg_vad_scores into
# three scalar columns and drop the list column. The new columns are built
# as their own frame (same index) and concatenated, instead of being
# assigned onto a slice, which emitted SettingWithCopyWarning once per column.
vad_score_columns = ['average_v_score', 'average_a_score', 'average_d_score']
vad_scores = pd.DataFrame(
    comment_phab_df['avg_vad_scores'].tolist(),
    index=comment_phab_df.index,
    columns=vad_score_columns,
)
comment_phab_df = pd.concat(
    [comment_phab_df.drop(columns=['avg_vad_scores']), vad_scores],
    axis=1,
)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "184ccbe6-0a7a-41b8-9b02-bc439ff975d0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# expand the dependency parser
# Collect one record per token that matches `pattern`, carrying the comment's
# id, timestamp and affiliation plus the token's position in the parse.

#pattern = r'\b(ve|VE|visualeditor|VisualEditor)\b'
#pattern = r'\b(WMF|Foundation)\b'
#pattern = r'\b(bots|scripts|gadgets)\b'
pattern = r'\b(http|https)\b'
# Compiled once instead of being passed to re.search for every token.
keyword_re = re.compile(pattern, re.IGNORECASE)

relation_columns = [
    'comment_id', 'timestamp', 'wmfAffil', 'token',
    'dependency', 'head', 'depth', 'children',
]

dependency_relations = []
# Kept for the disabled pass over 'resolved_dependency_tree' (that column is
# only produced by a commented-out cell); nothing is appended to it here.
resolved_dependency_relations = []

for index, row in comment_phab_df.iterrows():
    timestamp = row['timestamp']
    comment_id = row['id']
    conversation_id = row['conversation_id']
    WMFaffil = row['meta.affil']

    for token, lemma, dep, head, ancestors, subtree, children in row['dependency_tree']:
        if keyword_re.search(token):
            dependency_relations.append({
                'comment_id': comment_id,
                'timestamp': timestamp,
                'wmfAffil': WMFaffil,
                'token': token,
                'dependency': dep,
                'head': head,
                # ancestors/children are already lists in the stored tuples.
                'depth': len(ancestors),
                'children': len(children),
            })

#resolved_dependency_relations_df = pd.DataFrame(resolved_dependency_relations)
# Explicit columns keep the frame's schema stable even when nothing matched.
dependency_relations_df = pd.DataFrame(dependency_relations, columns=relation_columns)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "82498686-14f4-40c8-9e33-27b31f115b47",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#now analysis/plotting \n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from matplotlib.gridspec import GridSpec\n",
|
|
"import matplotlib.ticker as ticker\n",
|
|
"import matplotlib.dates as mdates"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "9833922d-d69a-4f8d-96ed-b25eea626114",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Reference date for the weekly bins (timezone-naive).
given_date = pd.Timestamp("2015-07-02").tz_localize(None)

# Drop the timezone from the task timestamps so they can be subtracted from
# the naive reference date.
naive_timestamps = pd.to_datetime(task_phab_df['timestamp'], unit='s').dt.tz_localize(None)
task_phab_df['timestamp'] = naive_timestamps
#task_phab_df = task_phab_df[(task_phab_df['date_created'] < 1380585599) & (task_phab_df['date_created'] > 1352592000)]

# Whole weeks relative to given_date (floor division, so earlier dates are negative).
days_from_given_date = (naive_timestamps - given_date).dt.days
task_phab_df['week_bin'] = days_from_given_date // 7
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "ebd80040-8e9b-49f3-9eea-5643bdf12f5b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Preview the first rows only; a bare `task_phab_df` would render the whole frame.
task_phab_df.head()
#task_phab_df.to_csv("/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/phab_tasks.csv", index=False)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "82cd9dde-0d14-4de5-8482-5a39de8d2869",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_96995/627627281.py:7: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
|
" task_phab_df['week'] = task_phab_df['timestamp'].dt.to_period('W').dt.start_time\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1000x600 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
# Weekly count of new relevant tasks split by author affiliation, drawn as a
# mirrored bar chart: non-affiliated counts above the axis, WMF-affiliated
# counts (negated) below it.
#task_phab_df = phab_df[phab_df['comment_type']=="task_description"]
# .copy() so the column assignments below do not target a slice.
task_phab_df = task_phab_df[task_phab_df['is_relevant'] == True].copy()
# True for each speaker's first five tasks, ordered by timestamp.
task_phab_df['first_comment'] = task_phab_df.groupby('speaker')['timestamp'].rank(method='first') <= 5
#task_phab_df = task_phab_df[(task_phab_df['date_created'] < 1383264000) & (task_phab_df['date_created'] > 1351728000)]

# Drop any timezone explicitly before converting to weekly periods;
# to_period() would otherwise drop it with a UserWarning.
task_phab_df['week'] = task_phab_df['timestamp'].dt.tz_localize(None).dt.to_period('W').dt.start_time
unique_taskPHIDs = task_phab_df.groupby('week')['conversation_id'].nunique()

wmf_task_phab_df = task_phab_df[(task_phab_df['meta.affil'] == True)]
other_task_phab_df = task_phab_df[(task_phab_df['meta.affil'] != True)]

# sns.barplot treats x as categorical, so both series must share one week
# index; otherwise a week missing from one group shifts that group's bars
# onto the wrong positions. Weeks without tasks for a group count as 0.
all_weeks = unique_taskPHIDs.index
wmf_tasks = (
    wmf_task_phab_df.groupby('week')['conversation_id'].nunique()
    .reindex(all_weeks, fill_value=0)
)
other_tasks = (
    other_task_phab_df.groupby('week')['conversation_id'].nunique()
    .reindex(all_weeks, fill_value=0)
)

plt.figure(figsize=(10, 6))
#sns.barplot(x=unique_taskPHIDs.index, y=unique_taskPHIDs.values, color='black', label='Total')
sns.barplot(x=other_tasks.index, y=other_tasks.values, color='#5da2d8', label='Nonaffiliated authors')
sns.barplot(x=wmf_tasks.index, y=-wmf_tasks.values, color='#c7756a', label='WMF-affiliated authors')

plt.title('New Relevant Phabricator Tasks Indexed with HTTP')
plt.xlabel('Timestamp')
plt.ylabel('Unique taskPHIDs')
plt.xticks(rotation=90)
plt.grid(True)
plt.tight_layout()
plt.show()

#plt.savefig('031825_new_tasks_fig.png')
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "9a9b08a7-6c95-4971-b259-8e713c58fbe7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_96995/3303796756.py:4: SettingWithCopyWarning: \n",
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
"\n",
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
" unaff_tasks_phab_df['speakers_task'] = unaff_tasks_phab_df.groupby('speaker')['timestamp'].rank(method='first').astype(int)\n",
|
|
"/tmp/ipykernel_96995/3303796756.py:17: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
|
|
" unaff_tasks_phab_df['week'] = unaff_tasks_phab_df['timestamp'].dt.to_period('W').dt.start_time\n",
|
|
"/tmp/ipykernel_96995/3303796756.py:18: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" weekly_breakdown = unaff_tasks_phab_df.groupby(['week', 'task_bins']).size().unstack(fill_value=0)\n",
|
|
"/tmp/ipykernel_96995/3303796756.py:20: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" speaker_breakdown = unaff_tasks_phab_df.groupby(['week', 'task_bins']).nunique()['speaker'].unstack(fill_value=0)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1200x800 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
# Weekly task creation by non-affiliated authors between 2015-06-12 and
# 2015-10-01, grouped by how many tasks the author had already created.
#task_phab_df = phab_df[phab_df['comment_type'] == "task_description"]
# .copy() so the column assignment below does not target a slice.
unaff_tasks_phab_df = task_phab_df[task_phab_df['meta.affil'] != True].copy()
# Rank speaker's task values within each group
# (1 = the speaker's earliest task in task_phab_df, by timestamp)
unaff_tasks_phab_df['speakers_task'] = unaff_tasks_phab_df.groupby('speaker')['timestamp'].rank(method='first').astype(int)

# Filter dates 06-12-2015 to 10-01-2015
# (epoch seconds: 2015-06-12 00:00:00 UTC < date_created < 2015-10-01 23:59:59 UTC)
unaff_tasks_phab_df = unaff_tasks_phab_df[(unaff_tasks_phab_df['date_created'] < 1443743999) & (unaff_tasks_phab_df['date_created'] > 1434067200)]
# Bin the speakers based on the number of tasks they created
# Bins are left-closed (right=False): [0, 6), [6, 26), [26, 51), [51, inf).
bins = [0, 6, 26, 51, float('inf')]
labels = ['0-5', '6-25', '26-50', '51+']
# Each speaker's lowest task rank inside the window.
min_speakers_task = unaff_tasks_phab_df.groupby('speaker')['speakers_task'].min().reset_index()
min_speakers_task = min_speakers_task.rename(columns={'speakers_task': 'min_speakers_task'})
unaff_tasks_phab_df = unaff_tasks_phab_df.merge(min_speakers_task, on='speaker', how='left')
unaff_tasks_phab_df['task_bins'] = pd.cut(unaff_tasks_phab_df['min_speakers_task'], bins=bins, labels=labels, right=False)

# Calculate the weekly breakdown of binned speakers_task values
# Drop any timezone explicitly; to_period() would otherwise drop it with a UserWarning.
unaff_tasks_phab_df['week'] = unaff_tasks_phab_df['timestamp'].dt.tz_localize(None).dt.to_period('W').dt.start_time
# observed=False is passed explicitly: it keeps every bin label as a column
# after unstack (melt below lists all of them) and silences the
# FutureWarning about the changing default.
weekly_breakdown = (
    unaff_tasks_phab_df.groupby(['week', 'task_bins'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Distinct speakers per week and bin; nunique is computed on 'speaker' only
# rather than on every column of the frame.
speaker_breakdown = (
    unaff_tasks_phab_df.groupby(['week', 'task_bins'], observed=False)['speaker']
    .nunique()
    .unstack(fill_value=0)
)

# Reshape the DataFrame for use with Seaborn
weekly_breakdown = weekly_breakdown.reset_index().melt(id_vars='week', value_vars=labels, var_name='task_bins', value_name='count')
speaker_breakdown = speaker_breakdown.reset_index().melt(id_vars='week', value_vars=labels, var_name='task_bins', value_name='speakers')

# Grouped bar plot: one bar per tenure bin for each week
plt.figure(figsize=(12, 8))
sns.barplot(data=weekly_breakdown, x='week', y='count', hue='task_bins', palette='colorblind')
#sns.barplot(data=speaker_breakdown, x='week', y='speakers', hue='task_bins', palette='colorblind')
plt.title("06-12-2015 to 10-01-2015 Weekly Unaffiliated Task Creation by Contributor Tenure")
plt.xlabel('Week')
plt.ylabel('Tasks')
# NOTE(review): the legend says "8-01-2013", while the filtering cell's
# comment gives the data window as starting 07-01-2013 -- confirm which is intended.
plt.legend(title="Contributor had created # tasks between 8-01-2013 and 06-12-2015:")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#plt.savefig('031625_weekly_tasks_by_history.png')
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "b7cfad77-d48a-4708-91f3-89ae1179b90c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_34086/62586942.py:27: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" comment_counts = affective_comment_phab_df.groupby('date_group').size()\n",
|
|
"/tmp/ipykernel_34086/62586942.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" speaker_counts = affective_comment_phab_df.groupby('date_group')['speaker'].nunique()\n",
|
|
"/tmp/ipykernel_34086/62586942.py:35: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" comment_counts_engaged = affective_comment_phab_df.groupby(['date_group', 'est_commenter', 'meta.affil']).size()\n",
|
|
"/tmp/ipykernel_34086/62586942.py:36: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
|
|
" speaker_counts_engaged = affective_comment_phab_df.groupby(['date_group', 'est_commenter', 'meta.affil'])['speaker'].nunique()\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of comments for each date group:\n",
|
|
"date_group\n",
|
|
"Before announcement 10614\n",
|
|
"After announcement, before deployment 802\n",
|
|
"After deployment 3074\n",
|
|
"dtype: int64\n",
|
|
"\n",
|
|
"Number of speakers for each date group:\n",
|
|
"date_group\n",
|
|
"Before announcement 521\n",
|
|
"After announcement, before deployment 142\n",
|
|
"After deployment 310\n",
|
|
"Name: speaker, dtype: int64\n",
|
|
"\n",
|
|
"Number of comments for each date group and engaged commenter subgroup:\n",
|
|
"date_group est_commenter meta.affil\n",
|
|
"Before announcement False False 10317\n",
|
|
" True 297\n",
|
|
"After announcement, before deployment False False 787\n",
|
|
" True 15\n",
|
|
"After deployment False False 2992\n",
|
|
" True 82\n",
|
|
"dtype: int64\n",
|
|
"\n",
|
|
"Number of speakers for each date group and engaged commenter subgroup:\n",
|
|
"date_group est_commenter meta.affil\n",
|
|
"Before announcement False False 518\n",
|
|
" True 56\n",
|
|
"After announcement, before deployment False False 138\n",
|
|
" True 7\n",
|
|
"After deployment False False 305\n",
|
|
" True 24\n",
|
|
"Name: speaker, dtype: int64\n",
|
|
"\n",
|
|
"Number of comments for each engaged commenter subgroup, and WMF affiliation:\n",
|
|
"est_commenter meta.affil\n",
|
|
"False False 14096\n",
|
|
" True 394\n",
|
|
"dtype: int64\n",
|
|
"\n",
|
|
"Number of speakers for each engaged commenter subgroup, and WMF affiliation:\n",
|
|
"est_commenter meta.affil\n",
|
|
"False False 627\n",
|
|
" True 75\n",
|
|
"Name: speaker, dtype: int64\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'\\nplot1 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"dominant_wc\", hue=\"date_group\", col=\"meta.affil\", row=\\'new_commenter\\', scatter=False, legend=False, palette=palette)\\nplot1.set_axis_labels(\"Timestamp\", \"Count of Dominance Polarized Words\")\\nplot1.set_titles(row_template=\"Author\\'s 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\\nplot1.fig.subplots_adjust(top=0.9) # Adjust subplots to make room for the title\\nplot1.add_legend(title=\"Comment publication timestamp:\")\\nfig1 = plot1.fig\\n# Plot for arousal_wc\\nplot2 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"arousal_wc\", hue=\"date_group\", col=\"meta.affil\", row=\\'engaged_commenter\\', scatter=False, legend=False, palette=palette)\\nplot2.set_axis_labels(\"Timestamp\", \"Count of Arousal Polarized Words\")\\nplot2.set_titles(row_template=\"Author\\'s 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\\nplot2.add_legend(title=\"Comment publication timestamp:\")\\n#plot2.add_legend(title=\"Before/After 07/01/2013 Wide Release\")\\n\\nplot3 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"valence_wc\", hue=\"date_group\", col=\"meta.affil\", row=\\'engaged_commenter\\', scatter=False, legend=False, palette=palette)\\nplot3.set_axis_labels(\"Timestamp\", \"Count of Valence Polarized Words\")\\nplot3.set_titles(row_template=\"Author\\'s 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\\nplot3.add_legend(title=\"Comment publication timestamp:\")\\n'"
|
|
]
|
|
},
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1333.5x500 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Date-group boundaries: announcement (2015-06-12) and deployment (2015-07-02).\n",
|
|
"# The first and last timestamps are open-ended sentinels; pd.cut below uses right=False, so each bin is [start, end).\n",
|
|
"bins = [\n",
|
|
"    pd.Timestamp('1900-01-01 00:00:01+00:00'),\n",
|
|
"    pd.Timestamp('2015-06-12 00:00:01+00:00'),\n",
|
|
"    pd.Timestamp('2015-07-02 00:00:01+00:00'),\n",
|
|
"    pd.Timestamp('2100-08-28 00:00:01+00:00')\n",
|
|
"]\n",
|
|
"labels = ['Before announcement', 'After announcement, before deployment', 'After deployment']\n",
|
|
"\n",
|
|
"#creating variables of interest\n",
|
|
"# NOTE: this is an alias, not a copy -- the two columns assigned below are also added to comment_phab_df.\n",
|
|
"affective_comment_phab_df = comment_phab_df\n",
|
|
"affective_comment_phab_df['date_group'] = pd.cut(affective_comment_phab_df['timestamp'], bins=bins, labels=labels, right=False)\n",
|
|
"# speakers_comment: 1-based chronological index of each comment within its speaker (ties broken by row order).\n",
|
|
"affective_comment_phab_df['speakers_comment'] = affective_comment_phab_df.groupby('speaker')['timestamp'].rank(method='first').astype(int)\n",
|
|
"# all comments with date_created <= 1370044800 (2013-06-01 00:00:00 UTC as Unix seconds)\n",
|
|
"# NOTE(review): this cutoff predates the 2015 date bins above, and the recorded output shows est_commenter is False for every row -- confirm the cutoff is intended for this case.\n",
|
|
"subset_comment_phab_df = affective_comment_phab_df[affective_comment_phab_df['date_created'] <= 1370044800]\n",
|
|
"#getting counts \n",
|
|
"comment_counts = subset_comment_phab_df.groupby('speaker')['speakers_comment'].max().reset_index()\n",
|
|
"comment_counts = comment_counts.rename(columns={'speakers_comment': 'pre_june_2013_comments'})\n",
|
|
"#merge back \n",
|
|
"affective_comment_phab_df = affective_comment_phab_df.merge(comment_counts, on='speaker', how='left')\n",
|
|
"affective_comment_phab_df['pre_june_2013_comments'] = affective_comment_phab_df['pre_june_2013_comments'].fillna(0)\n",
|
|
"\n",
|
|
"affective_comment_phab_df['new_commenter'] = affective_comment_phab_df['pre_june_2013_comments'] <= 10\n",
|
|
"affective_comment_phab_df['est_commenter'] = affective_comment_phab_df['pre_june_2013_comments'] > 50\n",
|
|
"\n",
|
|
"palette = ['#31449c', '#4a7c85', '#c5db68']\n",
|
|
"\n",
|
|
"# date_group is categorical; observed=False is passed explicitly to keep the current result and silence the pandas FutureWarning.\n",
|
|
"comment_counts = affective_comment_phab_df.groupby('date_group', observed=False).size()\n",
|
|
"speaker_counts = affective_comment_phab_df.groupby('date_group', observed=False)['speaker'].nunique()\n",
|
|
"\n",
|
|
"print(\"Number of comments for each date group:\")\n",
|
|
"print(comment_counts)\n",
|
|
"print(\"\\nNumber of speakers for each date group:\")\n",
|
|
"print(speaker_counts)\n",
|
|
"\n",
|
|
"comment_counts_engaged = affective_comment_phab_df.groupby(['date_group', 'est_commenter', 'meta.affil'], observed=False).size()\n",
|
|
"speaker_counts_engaged = affective_comment_phab_df.groupby(['date_group', 'est_commenter', 'meta.affil'], observed=False)['speaker'].nunique()\n",
|
|
"\n",
|
|
"print(\"\\nNumber of comments for each date group and engaged commenter subgroup:\")\n",
|
|
"print(comment_counts_engaged)\n",
|
|
"print(\"\\nNumber of speakers for each date group and engaged commenter subgroup:\")\n",
|
|
"print(speaker_counts_engaged)\n",
|
|
"\n",
|
|
"comment_counts_wmf = affective_comment_phab_df.groupby(['est_commenter', 'meta.affil']).size()\n",
|
|
"speaker_counts_wmf = affective_comment_phab_df.groupby(['est_commenter', 'meta.affil'])['speaker'].nunique()\n",
|
|
"\n",
|
|
"print(\"\\nNumber of comments for each engaged commenter subgroup, and WMF affiliation:\")\n",
|
|
"print(comment_counts_wmf)\n",
|
|
"print(\"\\nNumber of speakers for each engaged commenter subgroup, and WMF affiliation:\")\n",
|
|
"print(speaker_counts_wmf)\n",
|
|
"\n",
|
|
"#comment_phab_df['before_after'] = comment_phab_df['timestamp'] > pd.Timestamp('2013-07-01 00:00:01+00:00')\n",
|
|
"#fig, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)\n",
|
|
"affective_comment_phab_df['polarized_wc'] = affective_comment_phab_df['dominant_wc'] + affective_comment_phab_df['valence_wc'] + affective_comment_phab_df['arousal_wc'] \n",
|
|
"plot1 = sns.lmplot(data=affective_comment_phab_df, x=\"date_created\", y=\"polarized_wc\", hue=\"date_group\", col=\"meta.affil\", row='est_commenter', scatter=False, legend=False, palette=palette)\n",
|
|
"plot1.set_axis_labels(\"Timestamp\", \"Count of Polarized Words\")\n",
|
|
"plot1.set_titles(row_template=\"Established Author: {row_name}\", col_template=\"WMF Affiliation: {col_name}\")\n",
|
|
"plot1.figure.subplots_adjust(top=0.9) # Adjust subplots to make room for the title\n",
|
|
"plot1.add_legend(title=\"Comment publication timestamp:\")\n",
|
|
"fig1 = plot1.figure\n",
|
|
"# Disabled per-dimension plots (dominance / arousal / valence), kept as comments so the cell no longer echoes them as its output.\n",
|
|
"#plot1 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"dominant_wc\", hue=\"date_group\", col=\"meta.affil\", row='new_commenter', scatter=False, legend=False, palette=palette)\n",
|
|
"#plot1.set_axis_labels(\"Timestamp\", \"Count of Dominance Polarized Words\")\n",
|
|
"#plot1.set_titles(row_template=\"Author's 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\n",
|
|
"#plot1.fig.subplots_adjust(top=0.9) # Adjust subplots to make room for the title\n",
|
|
"#plot1.add_legend(title=\"Comment publication timestamp:\")\n",
|
|
"#fig1 = plot1.fig\n",
|
|
"# Plot for arousal_wc\n",
|
|
"#plot2 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"arousal_wc\", hue=\"date_group\", col=\"meta.affil\", row='engaged_commenter', scatter=False, legend=False, palette=palette)\n",
|
|
"#plot2.set_axis_labels(\"Timestamp\", \"Count of Arousal Polarized Words\")\n",
|
|
"#plot2.set_titles(row_template=\"Author's 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\n",
|
|
"#plot2.add_legend(title=\"Comment publication timestamp:\")\n",
|
|
"#plot2.add_legend(title=\"Before/After 07/01/2013 Wide Release\")\n",
|
|
"\n",
|
|
"#plot3 = sns.lmplot(data=comment_phab_df, x=\"date_created\", y=\"valence_wc\", hue=\"date_group\", col=\"meta.affil\", row='engaged_commenter', scatter=False, legend=False, palette=palette)\n",
|
|
"#plot3.set_axis_labels(\"Timestamp\", \"Count of Valence Polarized Words\")\n",
|
|
"#plot3.set_titles(row_template=\"Author's 100+ Comment: {row_name}\",col_template=\"WMF Affiliation: {col_name}\")\n",
|
|
"#plot3.add_legend(title=\"Comment publication timestamp:\")\n",
|
|
"# Show plots\n",
|
|
"#fig1.savefig('031725_engaged_commenter_D_scoring_fig.png')\n",
|
|
"#plot2.fig.savefig('031725_engaged_commenter_A_scoring_fig.png')\n",
|
|
"#plot3.fig.savefig('031725_engaged_commenter_V_scoring_fig.png')\n",
|
|
"#plt.savefig('031625_engaged_commenter_VAD_scoring_fig.png')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "5a91a59a-0d1c-48b3-93dd-b9df76ca68e5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1000x500 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Slide figure: polarized-word count against each speaker's comment index, one panel per WMF affiliation.\n",
|
|
"# Uses affective_comment_phab_df (with speakers_comment / polarized_wc / date_group) and palette from the previous cell.\n",
|
|
"sns.set_context(\"talk\", font_scale=1.2)\n",
|
|
"plot2 = sns.lmplot(data=affective_comment_phab_df, x=\"speakers_comment\", y=\"polarized_wc\", hue=\"date_group\", col=\"meta.affil\", scatter=False, legend=False, palette=palette)\n",
|
|
"plot2.set_axis_labels(\"Comment Index\", \"Count of Polarized Words\")\n",
|
|
"plot2.set_titles(col_template=\"WMF Affiliation: {col_name}\")\n",
|
|
"# FacetGrid.figure is the preferred accessor; .fig is deprecated in seaborn.\n",
|
|
"plot2.figure.subplots_adjust(top=0.9) # Adjust subplots to make room for the title\n",
|
|
"#plot2.add_legend(title=\"Comment publication timestamp:\")\n",
|
|
"plot2.figure.savefig('c3-050125_affective_language_use-slides.png')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d2d67d38-f005-4c94-be3c-39eb6b22686f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# NOTE(review): pattern is not assigned in this cell (the candidates below are commented out); it must already be defined before this cell runs.\n",
|
|
"#pattern = r'\\b(ve|VE|visualeditor|VisualEditor)\\b'\n",
|
|
"#pattern = r'\\b(WMF|Foundation)\\b'\n",
|
|
"#pattern = r'\\b(bots)\\b'\n",
|
|
"# na=False treats missing tokens as non-matches instead of producing a NaN mask;\n",
|
|
"# .copy() gives independent frames so the 'week' assignments below do not write to a slice of the source frames.\n",
|
|
"filtered_dependencies = dependency_relations_df[dependency_relations_df['token'].str.contains(pattern, regex=True, na=False)].copy()\n",
|
|
"resolved_filtered_dependencies = resolved_dependency_relations_df[resolved_dependency_relations_df['token'].str.contains(pattern, regex=True, na=False)].copy()\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 8))\n",
|
|
"gs = GridSpec(2, 1, height_ratios=[6, 6])\n",
|
|
"\n",
|
|
"# Main plot: Token depth by timestamp (disabled; kept as comments for reference)\n",
|
|
"#ax0 = plt.subplot(gs[0])\n",
|
|
"#sns.scatterplot(data=filtered_dependencies, x='timestamp', y='dependency', hue='wmfAffil', style='dependency', markers=True, s=100, ax=ax0)\n",
|
|
"#ax0.set_title('VE Depth by Timestamp w/o URLS')\n",
|
|
"#ax0.set_xlabel('')\n",
|
|
"#ax0.set_ylabel('Dependency Type')\n",
|
|
"#ax0.legend().set_visible(False)\n",
|
|
"# Calculate the median depth over time\n",
|
|
"# week = start of the weekly period that contains each timestamp\n",
|
|
"filtered_dependencies['week'] = filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
|
"median_depth = filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"wmf_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] == True]\n",
|
|
"wmf_median_depth = wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"# != True (rather than == False) also keeps rows whose wmfAffil is missing\n",
|
|
"other_filtered_dependencies = filtered_dependencies[filtered_dependencies['wmfAffil'] != True]\n",
|
|
"other_median_depth = other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"# Plot the median depth over time\n",
|
|
"ax0 = plt.subplot(gs[0])\n",
|
|
"sns.lineplot(data=median_depth, x='week', y='depth', ax=ax0, color='black', label='Median Depth', marker='o')\n",
|
|
"sns.lineplot(data=wmf_median_depth, x='week', y='depth', ax=ax0, color='#c7756a', label='WMF-affiliated authors', marker='x')\n",
|
|
"sns.lineplot(data=other_median_depth, x='week', y='depth', ax=ax0, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n",
|
|
"# NOTE(review): the titles hard-code \"VE\" regardless of which pattern was used -- confirm they match the active pattern.\n",
|
|
"ax0.set_title('Median Depth of \"VE\" in Phabricator Sentence Dependency Trees')\n",
|
|
"ax0.set_ylabel('Median Depth')\n",
|
|
"ax0.set_xlabel('')\n",
|
|
"\n",
|
|
"# Calculate the median depth over time (coreference-resolved frame)\n",
|
|
"resolved_filtered_dependencies['week'] = resolved_filtered_dependencies['timestamp'].dt.to_period('W').dt.start_time\n",
|
|
"resolved_median_depth = resolved_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"resolved_wmf_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] == True]\n",
|
|
"resolved_wmf_median_depth = resolved_wmf_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"resolved_other_filtered_dependencies = resolved_filtered_dependencies[resolved_filtered_dependencies['wmfAffil'] != True]\n",
|
|
"resolved_other_median_depth = resolved_other_filtered_dependencies.groupby('week')['depth'].median().reset_index()\n",
|
|
"\n",
|
|
"# Plot the median depth over time\n",
|
|
"ax1 = plt.subplot(gs[1])\n",
|
|
"sns.lineplot(data=resolved_median_depth, x='week', y='depth', ax=ax1, color='black', label='Median Depth', marker='o')\n",
|
|
"sns.lineplot(data=resolved_wmf_median_depth, x='week', y='depth', ax=ax1, color='#c7756a', label='WMF-affiliated authors', marker='x')\n",
|
|
"sns.lineplot(data=resolved_other_median_depth, x='week', y='depth', ax=ax1, color='#5da2d8', label='Nonaffiliated authors', marker='x')\n",
|
|
"ax1.set_title('Median Depth of \"VE\" in Coreference-resolved Phabricator Sentence Dependency Trees')\n",
|
|
"ax1.set_ylabel('Median Depth')\n",
|
|
"ax1.set_xlabel('')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"#plt.show()\n",
|
|
"\n",
|
|
"#plt.savefig('031625_VE_depth_fig.png')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|