1
0
mw-lifecycle-analysis/phab_analysis/case3/coref_resolution-http.ipynb
Matthew Gaughan 3573afbc1a reorganizing
2025-05-18 16:50:20 -07:00

780 lines
31 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e30e81ad",
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
" # expanded dictionary for relevancy\n",
" # http, login, SSL, TLS, certificate \n",
" for word in text.split():\n",
" if \"://\" not in word.lower():\n",
" #http\n",
" if \"http\" in word.lower():\n",
" return True\n",
" #login\n",
" if \"login\" in word.lower():\n",
" return True\n",
" #ssl\n",
" if \"ssl\" in word.lower():\n",
" return True\n",
" #tls\n",
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
" return True\n",
" return False\n",
"\n",
"def is_migrated(comment_text):\n",
" if pd.isnull(comment_text):\n",
" return False\n",
" text = comment_text.strip()\n",
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
" return True \n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f359805f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:45: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"# Clean the raw Phabricator export and cut it down to the conversations of interest.\n",
"# Produces: task_phab_df (task descriptions) and comment_phab_df (all non-Gerrit rows),\n",
"# both restricted to relevant, non-migrated conversations inside the date window.\n",
"\n",
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"# reply_to is the id of the previous row of the same task; the first row of each task has none.\n",
"# NOTE(review): this follows file order, not date_created order, and is computed before any rows are filtered out - confirm that is intended\n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
"    'AuthorPHID': 'speaker',\n",
"    'TaskPHID': 'conversation_id',\n",
"    'WMFaffil':'meta.affil',\n",
"    'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"# earlier, narrower window: after 10-01-2014 before 10-01-2015\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] >= 1412207999)]\n",
"# current window: after 07-01-2013 before 10-01-2015, both bounds exclusive, in unix seconds\n",
"#   1372636800 = 2013-07-01 00:00:00 UTC\n",
"#   1443743999 = 2015-10-01 23:59:59 UTC\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] > 1372636800)]\n",
"\n",
"#removing headless conversations\n",
"# .copy(): columns are added to this frame below; copying avoids writing into a slice (SettingWithCopyWarning)\n",
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"].copy()\n",
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
"\n",
"#removing gerrit comments \n",
"# .copy() for the same reason as task_phab_df above\n",
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()\n",
"\n",
"# filter out the sourceforge migration \n",
"# Originally from: http://sourceforge.net in the task task_summary\n",
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
"\n",
"#cut down to only the data that is relevant (mentions http)\n",
"# relevance is decided from the task description / title only, then applied to the whole conversation\n",
"relevant_conversation_ids = task_phab_df[\n",
"    task_phab_df['comment_text'].apply(http_relevant) |\n",
"    task_phab_df['task_title'].apply(http_relevant)\n",
"]['conversation_id'].unique()\n",
"\n",
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"\n",
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"\n",
"# final frames are copied as well, because later cells assign columns to comment_phab_df\n",
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)].copy()\n",
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)].copy()\n",
"#comment_phab_df = mid_comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4241cb0a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>task_title</th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>speaker</th>\n",
" <th>meta.affil</th>\n",
" <th>conversation_id</th>\n",
" <th>comment_type</th>\n",
" <th>status</th>\n",
" <th>meta.gerrit</th>\n",
" <th>id</th>\n",
" <th>reply_to</th>\n",
" <th>timestamp</th>\n",
" <th>is_relevant</th>\n",
" <th>is_migrated</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>After a replace of old instances, it is not po...</td>\n",
" <td>1442753295</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_description</td>\n",
" <td>resolved</td>\n",
" <td>False</td>\n",
" <td>198</td>\n",
" <td>NaN</td>\n",
" <td>2015-09-20 12:48:15+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>Works now.</td>\n",
" <td>1442864673</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>199</td>\n",
" <td>198.0</td>\n",
" <td>2015-09-21 19:44:33+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>Ok, the instances are deleted now, I will recr...</td>\n",
" <td>1442864271</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>200</td>\n",
" <td>199.0</td>\n",
" <td>2015-09-21 19:37:51+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>The new instances have the same names as recen...</td>\n",
" <td>1442854156</td>\n",
" <td>PHID-USER-22bsa5u75jz3ci3wnplu</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>201</td>\n",
" <td>200.0</td>\n",
" <td>2015-09-21 16:49:16+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>This happens also with jessie and presice inst...</td>\n",
" <td>1442835238</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>202</td>\n",
" <td>201.0</td>\n",
" <td>2015-09-21 11:33:58+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406887</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>*** Bug 49454 has been marked as a duplicate o...</td>\n",
" <td>1379011061</td>\n",
" <td>PHID-USER-2nnm76h4ykalvvref2ye</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406888</td>\n",
" <td>406887.0</td>\n",
" <td>2013-09-12 18:37:41+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406888</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>JFTR, on Tools mosh-server processes eat up to...</td>\n",
" <td>1376245807</td>\n",
" <td>PHID-USER-vk6mlmacfhx77egryy5i</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406889</td>\n",
" <td>406888.0</td>\n",
" <td>2013-08-11 18:30:07+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406889</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>This is supported on tools, but adding it to t...</td>\n",
" <td>1376185312</td>\n",
" <td>PHID-USER-h75guknmwivm6x37iute</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406890</td>\n",
" <td>406889.0</td>\n",
" <td>2013-08-11 01:41:52+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406890</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>Just found out that mosh already works for too...</td>\n",
" <td>1376118400</td>\n",
" <td>PHID-USER-5dqihbanu3caaj7pigif</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406891</td>\n",
" <td>406890.0</td>\n",
" <td>2013-08-10 07:06:40+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406891</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>(In reply to comment #0)\\n&gt; ssh is quite painf...</td>\n",
" <td>1376118251</td>\n",
" <td>PHID-USER-6vzzsmi22zem6yttr6vp</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406892</td>\n",
" <td>406891.0</td>\n",
" <td>2013-08-10 07:04:11+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>14490 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" task_title \\\n",
"197 Creation of instances broken \n",
"198 Creation of instances broken \n",
"199 Creation of instances broken \n",
"200 Creation of instances broken \n",
"201 Creation of instances broken \n",
"... ... \n",
"406887 Allow login using mosh as an alternative to pl... \n",
"406888 Allow login using mosh as an alternative to pl... \n",
"406889 Allow login using mosh as an alternative to pl... \n",
"406890 Allow login using mosh as an alternative to pl... \n",
"406891 Allow login using mosh as an alternative to pl... \n",
"\n",
" comment_text date_created \\\n",
"197 After a replace of old instances, it is not po... 1442753295 \n",
"198 Works now. 1442864673 \n",
"199 Ok, the instances are deleted now, I will recr... 1442864271 \n",
"200 The new instances have the same names as recen... 1442854156 \n",
"201 This happens also with jessie and presice inst... 1442835238 \n",
"... ... ... \n",
"406887 *** Bug 49454 has been marked as a duplicate o... 1379011061 \n",
"406888 JFTR, on Tools mosh-server processes eat up to... 1376245807 \n",
"406889 This is supported on tools, but adding it to t... 1376185312 \n",
"406890 Just found out that mosh already works for too... 1376118400 \n",
"406891 (In reply to comment #0)\\n> ssh is quite painf... 1376118251 \n",
"\n",
" speaker meta.affil \\\n",
"197 PHID-USER-qlodcndtwpolbdhncjis False \n",
"198 PHID-USER-qlodcndtwpolbdhncjis False \n",
"199 PHID-USER-qlodcndtwpolbdhncjis False \n",
"200 PHID-USER-22bsa5u75jz3ci3wnplu False \n",
"201 PHID-USER-qlodcndtwpolbdhncjis False \n",
"... ... ... \n",
"406887 PHID-USER-2nnm76h4ykalvvref2ye False \n",
"406888 PHID-USER-vk6mlmacfhx77egryy5i False \n",
"406889 PHID-USER-h75guknmwivm6x37iute False \n",
"406890 PHID-USER-5dqihbanu3caaj7pigif False \n",
"406891 PHID-USER-6vzzsmi22zem6yttr6vp False \n",
"\n",
" conversation_id comment_type status \\\n",
"197 PHID-TASK-pitdrld6mszruqmc6usf task_description resolved \n",
"198 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"199 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"200 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"201 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"... ... ... ... \n",
"406887 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406888 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406889 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406890 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406891 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"\n",
" meta.gerrit id reply_to timestamp is_relevant \\\n",
"197 False 198 NaN 2015-09-20 12:48:15+00:00 True \n",
"198 False 199 198.0 2015-09-21 19:44:33+00:00 True \n",
"199 False 200 199.0 2015-09-21 19:37:51+00:00 True \n",
"200 False 201 200.0 2015-09-21 16:49:16+00:00 True \n",
"201 False 202 201.0 2015-09-21 11:33:58+00:00 True \n",
"... ... ... ... ... ... \n",
"406887 False 406888 406887.0 2013-09-12 18:37:41+00:00 True \n",
"406888 False 406889 406888.0 2013-08-11 18:30:07+00:00 True \n",
"406889 False 406890 406889.0 2013-08-11 01:41:52+00:00 True \n",
"406890 False 406891 406890.0 2013-08-10 07:06:40+00:00 True \n",
"406891 False 406892 406891.0 2013-08-10 07:04:11+00:00 True \n",
"\n",
" is_migrated \n",
"197 False \n",
"198 False \n",
"199 False \n",
"200 False \n",
"201 False \n",
"... ... \n",
"406887 False \n",
"406888 False \n",
"406889 False \n",
"406890 False \n",
"406891 False \n",
"\n",
"[14490 rows x 14 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "930c4d9c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/plain": [
"862"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Find the comments that are missing from a previously saved run, so only those need coreference resolution.\n",
"prior_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\"\n",
"prior_df = pd.read_csv(prior_path)\n",
"# Parse both timestamp columns as tz-aware UTC so they compare equal as merge keys.\n",
"# .assign() returns a new frame instead of writing into a filtered slice (no SettingWithCopyWarning).\n",
"comment_phab_df = comment_phab_df.assign(timestamp=pd.to_datetime(comment_phab_df['timestamp'], utc=True))\n",
"prior_df['timestamp'] = pd.to_datetime(prior_df['timestamp'], utc=True)\n",
"# No `on=` is given, so the merge joins on every column the two frames have in common.\n",
"merged_df = comment_phab_df.merge(prior_df, how='outer', indicator=True)\n",
"# 'left_only' rows exist in comment_phab_df but have no exact match in prior_df.\n",
"# .copy() because later cells add columns to this frame.\n",
"only_in_comment_phab_df = merged_df[merged_df['_merge'] == 'left_only'].copy()\n",
"len(only_in_comment_phab_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x154d9952a7c0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
"    \"\"\"Run ``text`` through the ``nlp`` pipeline and return it with coreferences resolved.\"\"\"\n",
"    return resolve_references(nlp(text))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"# Plain-string copy of each comment for the spaCy pipeline; note that str() turns a missing\n",
"# comment (NaN) into the literal string 'nan'.\n",
"# .assign() returns a new frame, so nothing is written into a slice of merged_df (no SettingWithCopyWarning).\n",
"only_in_comment_phab_df = only_in_comment_phab_df.assign(text=only_in_comment_phab_df['comment_text'].apply(str))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "79e3f7e2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"# Resolve coreferences for every remaining comment (the pipeline is called once per row).\n",
"# .assign() returns a new frame, so nothing is written into a slice of merged_df (no SettingWithCopyWarning).\n",
"only_in_comment_phab_df = only_in_comment_phab_df.assign(\n",
"    resolved_text=only_in_comment_phab_df['text'].apply(resolving_comment)\n",
")\n",
"# Persist straight away so the result of the expensive step is not lost with the kernel.\n",
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7423 [Backport was merged into 1.24wmf16 upon a tim...\n",
"7902 I guess this can be closed now as RESOLVED WOR...\n",
"7905 The upstream issue is https://github.com/jcgre...\n",
"7906 An update on this. In Amsterdam we found at th...\n",
"7907 Yes. It's used by people using pywikibot-as-a-...\n",
" ... \n",
"14465 I amended the title to the range IE8-10 becaus...\n",
"14466 If I remember correctly this problem was at le...\n",
"14467 If I remember correctly this problem was at le...\n",
"14468 After a quick test, autocomplete seems to work...\n",
"14478 Still not merged, so we can't really do much.\n",
"Name: resolved_text, Length: 862, dtype: object"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"only_in_comment_phab_df['resolved_text']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}