1
0

reorganizing

This commit is contained in:
Matthew Gaughan 2025-05-18 16:50:20 -07:00
parent ee31544b15
commit 3573afbc1a
52 changed files with 1085 additions and 37182 deletions

7
.gitignore vendored
View File

@ -1,10 +1,15 @@
# ignore the RStudio docker image needed by hyak
rstudio_latest.sif
rstudio-server.job
# do not need to include any R items
.Rhistory
.cache/
.config/
.local/
.RData
#can leave out misc tooling
.sh_history
.ipynb_checkpoints

View File

Before

Width:  |  Height:  |  Size: 102 KiB

After

Width:  |  Height:  |  Size: 102 KiB

View File

Before

Width:  |  Height:  |  Size: 69 KiB

After

Width:  |  Height:  |  Size: 69 KiB

View File

Before

Width:  |  Height:  |  Size: 145 KiB

After

Width:  |  Height:  |  Size: 145 KiB

View File

Before

Width:  |  Height:  |  Size: 77 KiB

After

Width:  |  Height:  |  Size: 77 KiB

View File

Before

Width:  |  Height:  |  Size: 92 KiB

After

Width:  |  Height:  |  Size: 92 KiB

View File

Before

Width:  |  Height:  |  Size: 46 KiB

After

Width:  |  Height:  |  Size: 46 KiB

File diff suppressed because one or more lines are too long

View File

@ -10,9 +10,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
@ -29,7 +29,7 @@
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0514_https_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
@ -40,7 +40,6 @@
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
@ -61,7 +60,7 @@
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\"):\n",
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
" return True\n",
" return False\n",
"\n",
@ -84,12 +83,12 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
@ -114,9 +113,9 @@
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"# after 12-1-2012 before 12-1-2013\n",
"# after 9-3-2011 before 11-27-2013\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385596799) & (phab_df['date_created'] > 1315008000)]\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
"\n",
"#removing headless conversations\n",
@ -151,16 +150,42 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "ffd0b263",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique conversation_ids: 1074\n",
"Unique ids: 6515\n",
"Unique speakers: 305\n"
]
}
],
"source": [
"unique_conversation_ids = len(comment_phab_df['conversation_id'].unique())\n",
"unique_ids = len(comment_phab_df['id'].unique())\n",
"unique_speakers = len(comment_phab_df['speaker'].unique())\n",
"\n",
"print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n",
"print(f\"Unique ids: {unique_ids}\")\n",
"print(f\"Unique speakers: {unique_speakers}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495ecba4bb0>"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -181,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
@ -214,7 +239,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
@ -260,7 +285,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
@ -273,7 +298,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
@ -283,7 +308,7 @@
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -294,7 +319,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
@ -302,7 +327,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
@ -310,13 +335,7 @@
" \"\"\"Entry point for launching an IPython kernel.\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n"
]
}
],
@ -354,18 +373,16 @@
]
}
],
"source": [
"comment_phab_df['resolved_text'][46088]"
]
"source": []
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_rel_phab_comments.csv\", index=False)"
]
}
],

View File

Before

Width:  |  Height:  |  Size: 55 KiB

After

Width:  |  Height:  |  Size: 55 KiB

View File

@ -1,48 +0,0 @@
# Exploratory sentence-level parsing of VisualEditor (VE) Phabricator comments.
#
# Steps:
#   1. Load the case-1 Phabricator comment export.
#   2. Flag comments that mention VisualEditor or bots/scripts/gadgets and keep
#      only comments created strictly between 2012-11-01 and 2013-11-01 (UTC).
#   3. Split every comment into sentences and re-flag at the sentence level.
#   4. Tokenize/annotate the sentences that mention VisualEditor with udpipe.

# All package loads live in one place (they were previously repeated
# throughout the script).
library(dplyr)
library(ggplot2)
library(stringr)
library(tidytext)
library(udpipe)
library(rsyntax) # https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
library(spacyr)

# NOTE(review): spacy_install() runs unconditionally on every execution and
# nothing below uses spaCy yet -- confirm whether it is still needed.
spacy_install()

# Matches "visualeditor" or the stand-alone abbreviation "ve" in any casing.
# The look-arounds require a non-word character (or the string boundary) on
# both sides so that "ve" inside ordinary words ("have", "move", "I've") is
# NOT counted as a reference, while forms such as "ve.interface" still are.
ve_pattern <- "(?<![[:alnum:]_'])(visualeditor|ve)(?![[:alnum:]_])"

# TRUE where `text` mentions VisualEditor / VE (case-insensitive).
mentions_ve <- function(text) {
  grepl(ve_pattern, text, ignore.case = TRUE, perl = TRUE)
}

# TRUE where `text` contains "bots", "scripts" or "gadgets" (substring match,
# case-sensitive -- unchanged from the original behaviour).
mentions_bots <- function(text) {
  grepl("bots|scripts|gadgets", text)
}

phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"

phab_data <- read.csv(phab_data_path, header = TRUE) |>
  mutate(
    has_ref = mentions_ve(comment_text),
    has_bot_ref = mentions_bots(comment_text),
    timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC"),
    # comment_id is assigned before filtering, so ids refer to rows of the
    # full export rather than of the filtered subset.
    comment_id = row_number()
  ) |>
  # looking at all data between 11-1-2012 and 11-1-2013 (epoch seconds, UTC)
  filter(date_created < 1383264000 & date_created > 1351728000)

length(unique(phab_data$date_created))

#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) +
#  geom_point(alpha = 0.5) +
#  theme_minimal()
#g

# we first need to transform our comment level of analysis into sentences
# (unnest_tokens() lowercases the text by default, which is why the reference
# matching above has to be case-insensitive)
sentence_level_data <- phab_data |>
  unnest_tokens(sentence, comment_text, token = "sentences") |>
  group_by(comment_id) |>
  mutate(sentence_id = row_number()) |>
  ungroup() |>
  # drop the comment-level flags and recompute them per sentence
  dplyr::select(-has_bot_ref, -has_ref) |>
  mutate(
    has_ref = mentions_ve(sentence),
    has_bot_ref = mentions_bots(sentence)
  )

# we only care about stuff that mentions VE rn, then tokenize
ve_sentences <- sentence_level_data |>
  filter(has_ref) |>
  # one udpipe document per sentence; comment_id + sentence_id is unique
  mutate(doc_id = paste(comment_id, sentence_id, sep = "_"))

# udpipe() returns ONE ROW PER TOKEN, so its result cannot be assigned
# directly as a column of a one-row-per-sentence table. Annotate all
# sentences in a single call, then attach each sentence's token table as a
# list-column entry.
token_annotations <- udpipe(
  data.frame(
    doc_id = ve_sentences$doc_id,
    text = ve_sentences$sentence,
    stringsAsFactors = FALSE
  ),
  "english"
)

tokens_by_sentence <- split(
  token_annotations,
  # explicit levels keep the list aligned with the row order of ve_sentences
  factor(token_annotations$doc_id, levels = ve_sentences$doc_id)
)

sentence_level_data <- ve_sentences |>
  mutate(sentence_tokens = unname(tokens_by_sentence))

File diff suppressed because one or more lines are too long

View File

@ -1,219 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
" warnings.warn(warn_msg)\n"
]
},
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
" with torch.cuda.amp.autocast(self._mixed_precision):\n"
]
}
],
"source": [
"phab_df['text'] = phab_df['comment_text'].apply(str)\n",
"phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,393 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e30e81ad",
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
" # expanded dictionary for relevancy\n",
" # http, login, SSL, TLS, certificate \n",
" for word in text.split():\n",
" if \"://\" not in word.lower():\n",
" #http\n",
" if \"http\" in word.lower():\n",
" return True\n",
" #login\n",
" if \"login\" in word.lower():\n",
" return True\n",
" #ssl\n",
" if \"ssl\" in word.lower():\n",
" return True\n",
" #tls\n",
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\"):\n",
" return True\n",
" return False\n",
"\n",
"def is_migrated(comment_text):\n",
" if pd.isnull(comment_text):\n",
" return False\n",
" text = comment_text.strip()\n",
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
" return True \n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f359805f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
" 'AuthorPHID': 'speaker',\n",
" 'TaskPHID': 'conversation_id',\n",
" 'WMFaffil':'meta.affil',\n",
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"# after 12-1-2012 before 12-1-2013\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
"\n",
"#removing headless conversations\n",
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
"\n",
"#removing gerrit comments \n",
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
"\n",
"# filter out the sourceforge migration \n",
"# Originally from: http://sourceforge.net in the task task_summary\n",
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
"\n",
"#cut down to only the data that is relevant (mentions http)\n",
"relevant_conversation_ids = task_phab_df[\n",
" task_phab_df['comment_text'].apply(http_relevant) |\n",
" task_phab_df['task_title'].apply(http_relevant)\n",
"]['conversation_id'].unique()\n",
"\n",
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"\n",
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"\n",
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
"#comment_phab_df = mid_comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
]
}
],
"source": [
"comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n",
"comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "46088",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_61233/1116300830.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomment_phab_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'resolved_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m46088\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mkey_is_scalar\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 943\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_hashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1049\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1050\u001b[0m \u001b[0;31m# Similar to Index.get_value, but we do not fall back to positional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1051\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1052\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_values_for_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3365\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasnans\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088"
]
}
],
"source": [
"comment_phab_df['resolved_text'][46088]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,487 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fcc726a8-44a4-48cf-a1cd-937b05bd4d08",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1fceca29-48c1-4ba3-93ba-88724dea22a7",
"metadata": {},
"outputs": [],
"source": [
"first_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_to_2014.csv\"\n",
"first_resolved_df = pd.read_csv(first_resolved_path)\n",
"second_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_2014_to_2015.csv\"\n",
"second_resolved_df = pd.read_csv(second_resolved_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f26c31e7-bee1-4100-821f-769e5b1791bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8621"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(second_resolved_df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dfa81ca2-4d66-4679-bc3e-192d0cac67fa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5007"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(first_resolved_df)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6dc11bda-f0f6-4eb6-96f5-02ed9a3492ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13628"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_df = pd.concat([first_resolved_df, second_resolved_df])\n",
"unique_df = combined_df.drop_duplicates()\n",
"len(unique_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0c903199-8159-455c-aa7f-e57ef07ce03e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>task_title</th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>speaker</th>\n",
" <th>meta.affil</th>\n",
" <th>conversation_id</th>\n",
" <th>comment_type</th>\n",
" <th>status</th>\n",
" <th>meta.gerrit</th>\n",
" <th>id</th>\n",
" <th>reply_to</th>\n",
" <th>timestamp</th>\n",
" <th>is_relevant</th>\n",
" <th>is_migrated</th>\n",
" <th>text</th>\n",
" <th>resolved_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" <td>1411541280</td>\n",
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_description</td>\n",
" <td>resolved</td>\n",
" <td>False</td>\n",
" <td>243215</td>\n",
" <td>NaN</td>\n",
" <td>2014-09-24 06:48:00+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>Revert has been deployed.</td>\n",
" <td>1411573104</td>\n",
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243216</td>\n",
" <td>243215.0</td>\n",
" <td>2014-09-24 15:38:24+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Revert has been deployed.</td>\n",
" <td>Revert has been deployed.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" <td>1411572378</td>\n",
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243217</td>\n",
" <td>243216.0</td>\n",
" <td>2014-09-24 15:26:18+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" <td>1411545535</td>\n",
" <td>PHID-USER-v7bwpq3rs3zdxegibdbh</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243218</td>\n",
" <td>243217.0</td>\n",
" <td>2014-09-24 07:58:55+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" <td>1411542640</td>\n",
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243219</td>\n",
" <td>243218.0</td>\n",
" <td>2014-09-24 07:10:40+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8616</th>\n",
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" <td>1413205650</td>\n",
" <td>PHID-USER-hgn5uw2jafgjgfvxibhh</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378799</td>\n",
" <td>378798.0</td>\n",
" <td>2014-10-13 13:07:30+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8617</th>\n",
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" <td>1412958953</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378800</td>\n",
" <td>378799.0</td>\n",
" <td>2014-10-10 16:35:53+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8618</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" <td>1412362816</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_description</td>\n",
" <td>declined</td>\n",
" <td>False</td>\n",
" <td>378858</td>\n",
" <td>NaN</td>\n",
" <td>2014-10-03 19:00:16+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8619</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" <td>1412415111</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378860</td>\n",
" <td>378859.0</td>\n",
" <td>2014-10-04 09:31:51+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8620</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" <td>1412366627</td>\n",
" <td>PHID-USER-fn7qnpccfbitivgtw2rt</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378861</td>\n",
" <td>378860.0</td>\n",
" <td>2014-10-03 20:03:47+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13628 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" task_title \\\n",
"0 User with unattached accounts unable to login ... \n",
"1 User with unattached accounts unable to login ... \n",
"2 User with unattached accounts unable to login ... \n",
"3 User with unattached accounts unable to login ... \n",
"4 User with unattached accounts unable to login ... \n",
"... ... \n",
"8616 OAuth login refers to mediawiki.org:/ instead ... \n",
"8617 OAuth login refers to mediawiki.org:/ instead ... \n",
"8618 Improvements to Wikimedia SUL login dialog UI:... \n",
"8619 Improvements to Wikimedia SUL login dialog UI:... \n",
"8620 Improvements to Wikimedia SUL login dialog UI:... \n",
"\n",
" comment_text date_created \\\n",
"0 User:NickK reported in IRC that they're gettin... 1411541280 \n",
"1 Revert has been deployed. 1411573104 \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... 1411572378 \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... 1411545535 \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... 1411542640 \n",
"... ... ... \n",
"8616 > When I registered, phabricator linked mediaw... 1413205650 \n",
"8617 See {T574} for a related discussion. 1412958953 \n",
"8618 Some improvements to the Wikimedia SUL dialog:... 1412362816 \n",
"8619 I guess the same restrictions as in T543 apply... 1412415111 \n",
"8620 It's not entirely trivial to change 1412366627 \n",
"\n",
" speaker meta.affil \\\n",
"0 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
"1 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
"2 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
"3 PHID-USER-v7bwpq3rs3zdxegibdbh False \n",
"4 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
"... ... ... \n",
"8616 PHID-USER-hgn5uw2jafgjgfvxibhh False \n",
"8617 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8618 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8619 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8620 PHID-USER-fn7qnpccfbitivgtw2rt False \n",
"\n",
" conversation_id comment_type status meta.gerrit \\\n",
"0 PHID-TASK-mio2uq45ny7mms72syut task_description resolved False \n",
"1 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"2 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"3 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"4 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"... ... ... ... ... \n",
"8616 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
"8617 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
"8618 PHID-TASK-j6czqxlv5fzcx3tmq23n task_description declined False \n",
"8619 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
"8620 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
"\n",
" id reply_to timestamp is_relevant is_migrated \\\n",
"0 243215 NaN 2014-09-24 06:48:00+00:00 True False \n",
"1 243216 243215.0 2014-09-24 15:38:24+00:00 True False \n",
"2 243217 243216.0 2014-09-24 15:26:18+00:00 True False \n",
"3 243218 243217.0 2014-09-24 07:58:55+00:00 True False \n",
"4 243219 243218.0 2014-09-24 07:10:40+00:00 True False \n",
"... ... ... ... ... ... \n",
"8616 378799 378798.0 2014-10-13 13:07:30+00:00 True False \n",
"8617 378800 378799.0 2014-10-10 16:35:53+00:00 True False \n",
"8618 378858 NaN 2014-10-03 19:00:16+00:00 True False \n",
"8619 378860 378859.0 2014-10-04 09:31:51+00:00 True False \n",
"8620 378861 378860.0 2014-10-03 20:03:47+00:00 True False \n",
"\n",
" text \\\n",
"0 User:NickK reported in IRC that they're gettin... \n",
"1 Revert has been deployed. \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
"... ... \n",
"8616 > When I registered, phabricator linked mediaw... \n",
"8617 See {T574} for a related discussion. \n",
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
"8619 I guess the same restrictions as in T543 apply... \n",
"8620 It's not entirely trivial to change \n",
"\n",
" resolved_text \n",
"0 User:NickK reported in IRC that they're gettin... \n",
"1 Revert has been deployed. \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
"... ... \n",
"8616 > When I registered, phabricator linked mediaw... \n",
"8617 See {T574} for a related discussion. \n",
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
"8619 I guess the same restrictions as in T543 apply... \n",
"8620 It's not entirely trivial to change \n",
"\n",
"[13628 rows x 16 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unique_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0c392d70-6236-4dfe-b6d4-bbe3f422b09e",
"metadata": {},
"outputs": [],
"source": [
"unique_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@ -1,779 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e30e81ad",
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
" # expanded dictionary for relevancy\n",
" # http, login, SSL, TLS, certificate \n",
" for word in text.split():\n",
" if \"://\" not in word.lower():\n",
" #http\n",
" if \"http\" in word.lower():\n",
" return True\n",
" #login\n",
" if \"login\" in word.lower():\n",
" return True\n",
" #ssl\n",
" if \"ssl\" in word.lower():\n",
" return True\n",
" #tls\n",
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
" return True\n",
" return False\n",
"\n",
"def is_migrated(comment_text):\n",
" if pd.isnull(comment_text):\n",
" return False\n",
" text = comment_text.strip()\n",
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
" return True \n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f359805f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:45: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
" 'AuthorPHID': 'speaker',\n",
" 'TaskPHID': 'conversation_id',\n",
" 'WMFaffil':'meta.affil',\n",
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"# after 10-01-2014 before 10-01-2015\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] >= 1412207999)]\n",
"# after 07-01-2013 before 10-01-2015\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] > 1372636800)]\n",
"\n",
"#removing headless conversations\n",
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
"\n",
"#removing gerrit comments \n",
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
"\n",
"# filter out the sourceforge migration \n",
"# Originally from: http://sourceforge.net in the task task_summary\n",
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
"\n",
"#cut down to only the data that is relevant (mentions http)\n",
"relevant_conversation_ids = task_phab_df[\n",
" task_phab_df['comment_text'].apply(http_relevant) |\n",
" task_phab_df['task_title'].apply(http_relevant)\n",
"]['conversation_id'].unique()\n",
"\n",
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"\n",
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"\n",
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
"#comment_phab_df = mid_comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4241cb0a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>task_title</th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>speaker</th>\n",
" <th>meta.affil</th>\n",
" <th>conversation_id</th>\n",
" <th>comment_type</th>\n",
" <th>status</th>\n",
" <th>meta.gerrit</th>\n",
" <th>id</th>\n",
" <th>reply_to</th>\n",
" <th>timestamp</th>\n",
" <th>is_relevant</th>\n",
" <th>is_migrated</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>After a replace of old instances, it is not po...</td>\n",
" <td>1442753295</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_description</td>\n",
" <td>resolved</td>\n",
" <td>False</td>\n",
" <td>198</td>\n",
" <td>NaN</td>\n",
" <td>2015-09-20 12:48:15+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>Works now.</td>\n",
" <td>1442864673</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>199</td>\n",
" <td>198.0</td>\n",
" <td>2015-09-21 19:44:33+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>Ok, the instances are deleted now, I will recr...</td>\n",
" <td>1442864271</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>200</td>\n",
" <td>199.0</td>\n",
" <td>2015-09-21 19:37:51+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>The new instances have the same names as recen...</td>\n",
" <td>1442854156</td>\n",
" <td>PHID-USER-22bsa5u75jz3ci3wnplu</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>201</td>\n",
" <td>200.0</td>\n",
" <td>2015-09-21 16:49:16+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201</th>\n",
" <td>Creation of instances broken</td>\n",
" <td>This happens also with jessie and presice inst...</td>\n",
" <td>1442835238</td>\n",
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>202</td>\n",
" <td>201.0</td>\n",
" <td>2015-09-21 11:33:58+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406887</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>*** Bug 49454 has been marked as a duplicate o...</td>\n",
" <td>1379011061</td>\n",
" <td>PHID-USER-2nnm76h4ykalvvref2ye</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406888</td>\n",
" <td>406887.0</td>\n",
" <td>2013-09-12 18:37:41+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406888</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>JFTR, on Tools mosh-server processes eat up to...</td>\n",
" <td>1376245807</td>\n",
" <td>PHID-USER-vk6mlmacfhx77egryy5i</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406889</td>\n",
" <td>406888.0</td>\n",
" <td>2013-08-11 18:30:07+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406889</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>This is supported on tools, but adding it to t...</td>\n",
" <td>1376185312</td>\n",
" <td>PHID-USER-h75guknmwivm6x37iute</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406890</td>\n",
" <td>406889.0</td>\n",
" <td>2013-08-11 01:41:52+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406890</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>Just found out that mosh already works for too...</td>\n",
" <td>1376118400</td>\n",
" <td>PHID-USER-5dqihbanu3caaj7pigif</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406891</td>\n",
" <td>406890.0</td>\n",
" <td>2013-08-10 07:06:40+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406891</th>\n",
" <td>Allow login using mosh as an alternative to pl...</td>\n",
" <td>(In reply to comment #0)\\n&gt; ssh is quite painf...</td>\n",
" <td>1376118251</td>\n",
" <td>PHID-USER-6vzzsmi22zem6yttr6vp</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>406892</td>\n",
" <td>406891.0</td>\n",
" <td>2013-08-10 07:04:11+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>14490 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" task_title \\\n",
"197 Creation of instances broken \n",
"198 Creation of instances broken \n",
"199 Creation of instances broken \n",
"200 Creation of instances broken \n",
"201 Creation of instances broken \n",
"... ... \n",
"406887 Allow login using mosh as an alternative to pl... \n",
"406888 Allow login using mosh as an alternative to pl... \n",
"406889 Allow login using mosh as an alternative to pl... \n",
"406890 Allow login using mosh as an alternative to pl... \n",
"406891 Allow login using mosh as an alternative to pl... \n",
"\n",
" comment_text date_created \\\n",
"197 After a replace of old instances, it is not po... 1442753295 \n",
"198 Works now. 1442864673 \n",
"199 Ok, the instances are deleted now, I will recr... 1442864271 \n",
"200 The new instances have the same names as recen... 1442854156 \n",
"201 This happens also with jessie and presice inst... 1442835238 \n",
"... ... ... \n",
"406887 *** Bug 49454 has been marked as a duplicate o... 1379011061 \n",
"406888 JFTR, on Tools mosh-server processes eat up to... 1376245807 \n",
"406889 This is supported on tools, but adding it to t... 1376185312 \n",
"406890 Just found out that mosh already works for too... 1376118400 \n",
"406891 (In reply to comment #0)\\n> ssh is quite painf... 1376118251 \n",
"\n",
" speaker meta.affil \\\n",
"197 PHID-USER-qlodcndtwpolbdhncjis False \n",
"198 PHID-USER-qlodcndtwpolbdhncjis False \n",
"199 PHID-USER-qlodcndtwpolbdhncjis False \n",
"200 PHID-USER-22bsa5u75jz3ci3wnplu False \n",
"201 PHID-USER-qlodcndtwpolbdhncjis False \n",
"... ... ... \n",
"406887 PHID-USER-2nnm76h4ykalvvref2ye False \n",
"406888 PHID-USER-vk6mlmacfhx77egryy5i False \n",
"406889 PHID-USER-h75guknmwivm6x37iute False \n",
"406890 PHID-USER-5dqihbanu3caaj7pigif False \n",
"406891 PHID-USER-6vzzsmi22zem6yttr6vp False \n",
"\n",
" conversation_id comment_type status \\\n",
"197 PHID-TASK-pitdrld6mszruqmc6usf task_description resolved \n",
"198 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"199 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"200 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"201 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
"... ... ... ... \n",
"406887 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406888 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406889 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406890 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"406891 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
"\n",
" meta.gerrit id reply_to timestamp is_relevant \\\n",
"197 False 198 NaN 2015-09-20 12:48:15+00:00 True \n",
"198 False 199 198.0 2015-09-21 19:44:33+00:00 True \n",
"199 False 200 199.0 2015-09-21 19:37:51+00:00 True \n",
"200 False 201 200.0 2015-09-21 16:49:16+00:00 True \n",
"201 False 202 201.0 2015-09-21 11:33:58+00:00 True \n",
"... ... ... ... ... ... \n",
"406887 False 406888 406887.0 2013-09-12 18:37:41+00:00 True \n",
"406888 False 406889 406888.0 2013-08-11 18:30:07+00:00 True \n",
"406889 False 406890 406889.0 2013-08-11 01:41:52+00:00 True \n",
"406890 False 406891 406890.0 2013-08-10 07:06:40+00:00 True \n",
"406891 False 406892 406891.0 2013-08-10 07:04:11+00:00 True \n",
"\n",
" is_migrated \n",
"197 False \n",
"198 False \n",
"199 False \n",
"200 False \n",
"201 False \n",
"... ... \n",
"406887 False \n",
"406888 False \n",
"406889 False \n",
"406890 False \n",
"406891 False \n",
"\n",
"[14490 rows x 14 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "930c4d9c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/plain": [
"862"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prior_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\"\n",
"prior_df = pd.read_csv(prior_path)\n",
"comment_phab_df['timestamp'] = pd.to_datetime(comment_phab_df['timestamp'], utc=True)\n",
"prior_df['timestamp'] = pd.to_datetime(prior_df['timestamp'], utc=True)\n",
"merged_df = comment_phab_df.merge(prior_df, how='outer', indicator=True)\n",
"len(merged_df)\n",
"only_in_comment_phab_df = merged_df[merged_df['_merge'] == 'left_only']\n",
"len(only_in_comment_phab_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x154d9952a7c0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# General English transformer pipeline, plus the experimental coreference\n",
"# pipeline that the coref components are sourced from.\n",
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components, so each one carries its own\n",
"# copy of the transformer layer when it is moved into `nlp`\n",
"for coref_component in (\"coref\", \"span_resolver\"):\n",
"    nlp_coref.replace_listeners(\"transformer\", coref_component, [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# Reference links for the coref pipeline setup:\n",
"#   https://github.com/explosion/spaCy/discussions/13572\n",
"#   https://github.com/explosion/spaCy/issues/13111\n",
"#   https://explosion.ai/blog/coref\n",
"#   https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"\n",
"# Run the combined pipeline once on a short sentence containing pronouns\n",
"doc = nlp(\n",
"    \"John is frustrated with the VisualEditor project, he thinks it doesn't work.\"\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
"    \"\"\"Replace every later coreference mention with its cluster's first mention.\n",
"\n",
"    doc (Doc): The Doc object processed by the coref pipeline; clusters are read\n",
"        from the doc.spans groups whose key starts with \"coref_cluster\".\n",
"    RETURNS (str): The Doc text with each non-first mention of a cluster replaced\n",
"        by the text of that cluster's first mention.\n",
"    \"\"\"\n",
"    # token.idx -> text emitted in place of that token\n",
"    token_mention_mapper = {}\n",
"    clusters = [\n",
"        val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
"    ]\n",
"\n",
"    # Iterate through every found cluster\n",
"    for cluster in clusters:\n",
"        mentions = list(cluster)\n",
"        if not mentions:\n",
"            # An empty cluster has no first mention to substitute; nothing to do\n",
"            continue\n",
"        first_mention = mentions[0]\n",
"        # Iterate through every other span in the cluster\n",
"        for mention_span in mentions[1:]:\n",
"            # The replacement stands in for the whole mention, so it takes the whitespace\n",
"            # that followed the mention's LAST token. Taking the first token's whitespace\n",
"            # left a stray space before punctuation after a multi-token mention\n",
"            # (\"... the big dog.\" became \"... <first mention> .\").\n",
"            token_mention_mapper[mention_span[0].idx] = (\n",
"                first_mention.text + mention_span[-1].whitespace_\n",
"            )\n",
"            # The remaining tokens of the mention are covered by the replacement above\n",
"            for token in mention_span[1:]:\n",
"                token_mention_mapper[token.idx] = \"\"\n",
"\n",
"    # Rebuild the text token by token: mapped tokens emit their replacement, all\n",
"    # others emit their original text plus trailing whitespace. Joined once rather\n",
"    # than grown by repeated string concatenation.\n",
"    return \"\".join(\n",
"        token_mention_mapper[token.idx]\n",
"        if token.idx in token_mention_mapper\n",
"        else token.text + token.whitespace_\n",
"        for token in doc\n",
"    )\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
"    \"\"\"Run `text` through the notebook-level `nlp` pipeline and return it with coreferences resolved.\"\"\"\n",
"    return resolve_references(nlp(text))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"# Coerce every comment to str before it is passed to the spaCy pipeline.\n",
"# .assign() returns a new frame instead of writing into a filtered slice,\n",
"# which is what raised SettingWithCopyWarning here.\n",
"only_in_comment_phab_df = only_in_comment_phab_df.assign(\n",
"    text=only_in_comment_phab_df['comment_text'].apply(str)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "79e3f7e2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"# Resolve coreferences row by row (one nlp() call per comment).\n",
"# .assign() returns a new frame instead of writing into a filtered slice,\n",
"# which is what raised SettingWithCopyWarning here.\n",
"only_in_comment_phab_df = only_in_comment_phab_df.assign(\n",
"    resolved_text=only_in_comment_phab_df['text'].apply(resolving_comment)\n",
")\n",
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7423 [Backport was merged into 1.24wmf16 upon a tim...\n",
"7902 I guess this can be closed now as RESOLVED WOR...\n",
"7905 The upstream issue is https://github.com/jcgre...\n",
"7906 An update on this. In Amsterdam we found at th...\n",
"7907 Yes. It's used by people using pywikibot-as-a-...\n",
" ... \n",
"14465 I amended the title to the range IE8-10 becaus...\n",
"14466 If I remember correctly this problem was at le...\n",
"14467 If I remember correctly this problem was at le...\n",
"14468 After a quick test, autocomplete seems to work...\n",
"14478 Still not merged, so we can't really do much.\n",
"Name: resolved_text, Length: 862, dtype: object"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"only_in_comment_phab_df['resolved_text']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long