780 lines
31 KiB
Plaintext
780 lines
31 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
|
||
" warnings.warn(\"Can't initialize NVML\")\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd \n",
|
||
"import spacy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv\"\n",
|
||
"phab_df = pd.read_csv(phab_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "e30e81ad",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#because of compute issues, need to do the sampling before the coreference resolution\n",
|
||
"def http_relevant(text):\n",
|
||
" if pd.isnull(text):\n",
|
||
" return False\n",
|
||
" # expanded dictionary for relevancy\n",
|
||
" # http, login, SSL, TLS, certificate \n",
|
||
" for word in text.split():\n",
|
||
" if \"://\" not in word.lower():\n",
|
||
" #http\n",
|
||
" if \"http\" in word.lower():\n",
|
||
" return True\n",
|
||
" #login\n",
|
||
" if \"login\" in word.lower():\n",
|
||
" return True\n",
|
||
" #ssl\n",
|
||
" if \"ssl\" in word.lower():\n",
|
||
" return True\n",
|
||
" #tls\n",
|
||
" if \"tls\" in word.lower():\n",
|
||
" return True\n",
|
||
" #cert\n",
|
||
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
|
||
" return True\n",
|
||
" return False\n",
|
||
"\n",
|
||
"def is_migrated(comment_text):\n",
|
||
" if pd.isnull(comment_text):\n",
|
||
" return False\n",
|
||
" text = comment_text.strip()\n",
|
||
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
|
||
" return True \n",
|
||
" return False"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "f359805f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:45: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
|
||
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||
"\n",
|
||
"#cleaning df\n",
|
||
"phab_df['id'] = phab_df.index + 1\n",
|
||
"#may have to build out the reply_to column \n",
|
||
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||
"\n",
|
||
"phab_df = phab_df.rename(columns={\n",
|
||
" 'AuthorPHID': 'speaker',\n",
|
||
" 'TaskPHID': 'conversation_id',\n",
|
||
" 'WMFaffil':'meta.affil',\n",
|
||
" 'isGerrit': 'meta.gerrit'\n",
|
||
"})\n",
|
||
"\n",
|
||
"# after 10-01-2014 before 10-01-2015\n",
|
||
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
|
||
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] >= 1412207999)]\n",
|
||
"# after 2013-07-01 00:00:00 UTC, before 2015-10-01 23:59:59 UTC\n",
|
||
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] > 1372636800)]\n",
|
||
"\n",
|
||
"#removing headless conversations\n",
|
||
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
|
||
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
|
||
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
|
||
"\n",
|
||
"#removing gerrit comments \n",
|
||
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
|
||
"\n",
|
||
"# filter out the sourceforge migration \n",
|
||
"# migrated tasks have a task description starting with: Originally from: http://sourceforge.net\n",
|
||
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
|
||
"\n",
|
||
"#cut down to only the conversations whose task title or description is relevant (mentions http, login, ssl, tls, or cert*, ignoring URLs)\n",
|
||
"relevant_conversation_ids = task_phab_df[\n",
|
||
" task_phab_df['comment_text'].apply(http_relevant) |\n",
|
||
" task_phab_df['task_title'].apply(http_relevant)\n",
|
||
"]['conversation_id'].unique()\n",
|
||
"\n",
|
||
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||
"\n",
|
||
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||
"\n",
|
||
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
|
||
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
|
||
"#comment_phab_df = mid_comment_phab_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "4241cb0a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>task_title</th>\n",
|
||
" <th>comment_text</th>\n",
|
||
" <th>date_created</th>\n",
|
||
" <th>speaker</th>\n",
|
||
" <th>meta.affil</th>\n",
|
||
" <th>conversation_id</th>\n",
|
||
" <th>comment_type</th>\n",
|
||
" <th>status</th>\n",
|
||
" <th>meta.gerrit</th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>reply_to</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>is_relevant</th>\n",
|
||
" <th>is_migrated</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>197</th>\n",
|
||
" <td>Creation of instances broken</td>\n",
|
||
" <td>After a replace of old instances, it is not po...</td>\n",
|
||
" <td>1442753295</td>\n",
|
||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||
" <td>task_description</td>\n",
|
||
" <td>resolved</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>198</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2015-09-20 12:48:15+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>198</th>\n",
|
||
" <td>Creation of instances broken</td>\n",
|
||
" <td>Works now.</td>\n",
|
||
" <td>1442864673</td>\n",
|
||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>199</td>\n",
|
||
" <td>198.0</td>\n",
|
||
" <td>2015-09-21 19:44:33+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>199</th>\n",
|
||
" <td>Creation of instances broken</td>\n",
|
||
" <td>Ok, the instances are deleted now, I will recr...</td>\n",
|
||
" <td>1442864271</td>\n",
|
||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>200</td>\n",
|
||
" <td>199.0</td>\n",
|
||
" <td>2015-09-21 19:37:51+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>200</th>\n",
|
||
" <td>Creation of instances broken</td>\n",
|
||
" <td>The new instances have the same names as recen...</td>\n",
|
||
" <td>1442854156</td>\n",
|
||
" <td>PHID-USER-22bsa5u75jz3ci3wnplu</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>201</td>\n",
|
||
" <td>200.0</td>\n",
|
||
" <td>2015-09-21 16:49:16+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>201</th>\n",
|
||
" <td>Creation of instances broken</td>\n",
|
||
" <td>This happens also with jessie and presice inst...</td>\n",
|
||
" <td>1442835238</td>\n",
|
||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>202</td>\n",
|
||
" <td>201.0</td>\n",
|
||
" <td>2015-09-21 11:33:58+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>406887</th>\n",
|
||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||
" <td>*** Bug 49454 has been marked as a duplicate o...</td>\n",
|
||
" <td>1379011061</td>\n",
|
||
" <td>PHID-USER-2nnm76h4ykalvvref2ye</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>406888</td>\n",
|
||
" <td>406887.0</td>\n",
|
||
" <td>2013-09-12 18:37:41+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>406888</th>\n",
|
||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||
" <td>JFTR, on Tools mosh-server processes eat up to...</td>\n",
|
||
" <td>1376245807</td>\n",
|
||
" <td>PHID-USER-vk6mlmacfhx77egryy5i</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>406889</td>\n",
|
||
" <td>406888.0</td>\n",
|
||
" <td>2013-08-11 18:30:07+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>406889</th>\n",
|
||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||
" <td>This is supported on tools, but adding it to t...</td>\n",
|
||
" <td>1376185312</td>\n",
|
||
" <td>PHID-USER-h75guknmwivm6x37iute</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>406890</td>\n",
|
||
" <td>406889.0</td>\n",
|
||
" <td>2013-08-11 01:41:52+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>406890</th>\n",
|
||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||
" <td>Just found out that mosh already works for too...</td>\n",
|
||
" <td>1376118400</td>\n",
|
||
" <td>PHID-USER-5dqihbanu3caaj7pigif</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>406891</td>\n",
|
||
" <td>406890.0</td>\n",
|
||
" <td>2013-08-10 07:06:40+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>406891</th>\n",
|
||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||
" <td>(In reply to comment #0)\\n> ssh is quite painf...</td>\n",
|
||
" <td>1376118251</td>\n",
|
||
" <td>PHID-USER-6vzzsmi22zem6yttr6vp</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||
" <td>task_subcomment</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>406892</td>\n",
|
||
" <td>406891.0</td>\n",
|
||
" <td>2013-08-10 07:04:11+00:00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>14490 rows × 14 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" task_title \\\n",
|
||
"197 Creation of instances broken \n",
|
||
"198 Creation of instances broken \n",
|
||
"199 Creation of instances broken \n",
|
||
"200 Creation of instances broken \n",
|
||
"201 Creation of instances broken \n",
|
||
"... ... \n",
|
||
"406887 Allow login using mosh as an alternative to pl... \n",
|
||
"406888 Allow login using mosh as an alternative to pl... \n",
|
||
"406889 Allow login using mosh as an alternative to pl... \n",
|
||
"406890 Allow login using mosh as an alternative to pl... \n",
|
||
"406891 Allow login using mosh as an alternative to pl... \n",
|
||
"\n",
|
||
" comment_text date_created \\\n",
|
||
"197 After a replace of old instances, it is not po... 1442753295 \n",
|
||
"198 Works now. 1442864673 \n",
|
||
"199 Ok, the instances are deleted now, I will recr... 1442864271 \n",
|
||
"200 The new instances have the same names as recen... 1442854156 \n",
|
||
"201 This happens also with jessie and presice inst... 1442835238 \n",
|
||
"... ... ... \n",
|
||
"406887 *** Bug 49454 has been marked as a duplicate o... 1379011061 \n",
|
||
"406888 JFTR, on Tools mosh-server processes eat up to... 1376245807 \n",
|
||
"406889 This is supported on tools, but adding it to t... 1376185312 \n",
|
||
"406890 Just found out that mosh already works for too... 1376118400 \n",
|
||
"406891 (In reply to comment #0)\\n> ssh is quite painf... 1376118251 \n",
|
||
"\n",
|
||
" speaker meta.affil \\\n",
|
||
"197 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||
"198 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||
"199 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||
"200 PHID-USER-22bsa5u75jz3ci3wnplu False \n",
|
||
"201 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||
"... ... ... \n",
|
||
"406887 PHID-USER-2nnm76h4ykalvvref2ye False \n",
|
||
"406888 PHID-USER-vk6mlmacfhx77egryy5i False \n",
|
||
"406889 PHID-USER-h75guknmwivm6x37iute False \n",
|
||
"406890 PHID-USER-5dqihbanu3caaj7pigif False \n",
|
||
"406891 PHID-USER-6vzzsmi22zem6yttr6vp False \n",
|
||
"\n",
|
||
" conversation_id comment_type status \\\n",
|
||
"197 PHID-TASK-pitdrld6mszruqmc6usf task_description resolved \n",
|
||
"198 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||
"199 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||
"200 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||
"201 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||
"... ... ... ... \n",
|
||
"406887 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||
"406888 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||
"406889 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||
"406890 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||
"406891 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||
"\n",
|
||
" meta.gerrit id reply_to timestamp is_relevant \\\n",
|
||
"197 False 198 NaN 2015-09-20 12:48:15+00:00 True \n",
|
||
"198 False 199 198.0 2015-09-21 19:44:33+00:00 True \n",
|
||
"199 False 200 199.0 2015-09-21 19:37:51+00:00 True \n",
|
||
"200 False 201 200.0 2015-09-21 16:49:16+00:00 True \n",
|
||
"201 False 202 201.0 2015-09-21 11:33:58+00:00 True \n",
|
||
"... ... ... ... ... ... \n",
|
||
"406887 False 406888 406887.0 2013-09-12 18:37:41+00:00 True \n",
|
||
"406888 False 406889 406888.0 2013-08-11 18:30:07+00:00 True \n",
|
||
"406889 False 406890 406889.0 2013-08-11 01:41:52+00:00 True \n",
|
||
"406890 False 406891 406890.0 2013-08-10 07:06:40+00:00 True \n",
|
||
"406891 False 406892 406891.0 2013-08-10 07:04:11+00:00 True \n",
|
||
"\n",
|
||
" is_migrated \n",
|
||
"197 False \n",
|
||
"198 False \n",
|
||
"199 False \n",
|
||
"200 False \n",
|
||
"201 False \n",
|
||
"... ... \n",
|
||
"406887 False \n",
|
||
"406888 False \n",
|
||
"406889 False \n",
|
||
"406890 False \n",
|
||
"406891 False \n",
|
||
"\n",
|
||
"[14490 rows x 14 columns]"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"comment_phab_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "930c4d9c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" This is separate from the ipykernel package so we can avoid doing imports until\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"862"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"prior_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\"\n",
|
||
"prior_df = pd.read_csv(prior_path)\n",
|
||
"comment_phab_df['timestamp'] = pd.to_datetime(comment_phab_df['timestamp'], utc=True)\n",
|
||
"prior_df['timestamp'] = pd.to_datetime(prior_df['timestamp'], utc=True)\n",
|
||
"merged_df = comment_phab_df.merge(prior_df, how='outer', indicator=True)\n",
|
||
"len(merged_df)\n",
|
||
"only_in_comment_phab_df = merged_df[merged_df['_merge'] == 'left_only']\n",
|
||
"len(only_in_comment_phab_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x154d9952a7c0>"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"nlp = spacy.load(\"en_core_web_trf\")\n",
|
||
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
|
||
"\n",
|
||
"# use replace_listeners for the coref components\n",
|
||
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
|
||
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
|
||
"\n",
|
||
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
|
||
"nlp.add_pipe(\"merge_entities\")\n",
|
||
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
|
||
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# https://github.com/explosion/spaCy/discussions/13572\n",
|
||
"# https://github.com/explosion/spaCy/issues/13111 \n",
|
||
"# https://explosion.ai/blog/coref\n",
|
||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "424d35e0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"doc"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||
"from spacy.tokens import Doc\n",
|
||
"# Define lightweight function for resolving references in text\n",
|
||
"def resolve_references(doc: Doc) -> str:\n",
|
||
" \"\"\"Function for resolving references with the coref output\n",
|
||
" doc (Doc): The Doc object processed by the coref pipeline\n",
|
||
" RETURNS (str): The Doc string with resolved references\n",
|
||
" \"\"\"\n",
|
||
" # token.idx : token.text\n",
|
||
" token_mention_mapper = {}\n",
|
||
" output_string = \"\"\n",
|
||
" clusters = [\n",
|
||
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
|
||
" ]\n",
|
||
"\n",
|
||
" # Iterate through every found cluster\n",
|
||
" for cluster in clusters:\n",
|
||
" first_mention = cluster[0]\n",
|
||
" # Iterate through every other span in the cluster\n",
|
||
" for mention_span in list(cluster)[1:]:\n",
|
||
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
|
||
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
|
||
" \n",
|
||
" for token in mention_span[1:]:\n",
|
||
" # Set empty string for all the other tokens in mention_span\n",
|
||
" token_mention_mapper[token.idx] = \"\"\n",
|
||
"\n",
|
||
" # Iterate through every token in the Doc\n",
|
||
" for token in doc:\n",
|
||
" # Check if token exists in token_mention_mapper\n",
|
||
" if token.idx in token_mention_mapper:\n",
|
||
" output_string += token_mention_mapper[token.idx]\n",
|
||
" # Else add original token text\n",
|
||
" else:\n",
|
||
" output_string += token.text + token.whitespace_\n",
|
||
"\n",
|
||
" return output_string\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def resolving_comment(text):\n",
|
||
" doc = nlp(text)\n",
|
||
" resolved_text = resolve_references(doc)\n",
|
||
" return resolved_text"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'i hate ve.interface, ve.interface always messes up i browser'"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" \"\"\"Entry point for launching an IPython kernel.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"only_in_comment_phab_df['text'] = only_in_comment_phab_df['comment_text'].apply(str)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "79e3f7e2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors\n",
|
||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" \"\"\"Entry point for launching an IPython kernel.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"only_in_comment_phab_df['resolved_text'] = only_in_comment_phab_df['text'].apply(resolving_comment)\n",
|
||
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"7423 [Backport was merged into 1.24wmf16 upon a tim...\n",
|
||
"7902 I guess this can be closed now as RESOLVED WOR...\n",
|
||
"7905 The upstream issue is https://github.com/jcgre...\n",
|
||
"7906 An update on this. In Amsterdam we found at th...\n",
|
||
"7907 Yes. It's used by people using pywikibot-as-a-...\n",
|
||
" ... \n",
|
||
"14465 I amended the title to the range IE8-10 becaus...\n",
|
||
"14466 If I remember correctly this problem was at le...\n",
|
||
"14467 If I remember correctly this problem was at le...\n",
|
||
"14468 After a quick test, autocomplete seems to work...\n",
|
||
"14478 Still not merged, so we can't really do much.\n",
|
||
"Name: resolved_text, Length: 862, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"only_in_comment_phab_df['resolved_text']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "92bf47ae",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|