reorganizing
7
.gitignore
vendored
@ -1,10 +1,15 @@
|
||||
# ignore the R studio docker image needed by hyak
|
||||
rstudio_latest.sif
|
||||
|
||||
rstudio-server.job
|
||||
|
||||
# do not need to include any R items
|
||||
.Rhistory
|
||||
.cache/
|
||||
.config/
|
||||
.local/
|
||||
.RData
|
||||
|
||||
#can leave out misc tooling
|
||||
.sh_history
|
||||
.ipynb_checkpoints
|
||||
|
||||
|
Before Width: | Height: | Size: 102 KiB After Width: | Height: | Size: 102 KiB |
Before Width: | Height: | Size: 69 KiB After Width: | Height: | Size: 69 KiB |
Before Width: | Height: | Size: 145 KiB After Width: | Height: | Size: 145 KiB |
Before Width: | Height: | Size: 77 KiB After Width: | Height: | Size: 77 KiB |
Before Width: | Height: | Size: 92 KiB After Width: | Height: | Size: 92 KiB |
Before Width: | Height: | Size: 46 KiB After Width: | Height: | Size: 46 KiB |
1032
phab_analysis/case2/040425_phab_comments.ipynb
Normal file
@ -10,9 +10,9 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
|
||||
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||
]
|
||||
}
|
||||
@ -29,7 +29,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
|
||||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0514_https_phab_comments.csv\"\n",
|
||||
"phab_df = pd.read_csv(phab_path)"
|
||||
]
|
||||
},
|
||||
@ -40,7 +40,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#because of compute issues, need to do the sampling before the coreference resolution\n",
|
||||
"def http_relevant(text):\n",
|
||||
" if pd.isnull(text):\n",
|
||||
" return False\n",
|
||||
@ -61,7 +60,7 @@
|
||||
" if \"tls\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #cert\n",
|
||||
" if word.lower().startswith(\"cert\"):\n",
|
||||
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
|
||||
" return True\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
@ -84,12 +83,12 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
@ -114,9 +113,9 @@
|
||||
" 'isGerrit': 'meta.gerrit'\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# after 12-1-2012 before 12-1-2013\n",
|
||||
"# after 9-3-2011 before 11-27-2013\n",
|
||||
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
|
||||
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
|
||||
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385596799) & (phab_df['date_created'] > 1315008000)]\n",
|
||||
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
|
||||
"\n",
|
||||
"#removing headless conversations\n",
|
||||
@ -151,16 +150,42 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ffd0b263",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Unique conversation_ids: 1074\n",
|
||||
"Unique ids: 6515\n",
|
||||
"Unique speakers: 305\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unique_conversation_ids = len(comment_phab_df['conversation_id'].unique())\n",
|
||||
"unique_ids = len(comment_phab_df['id'].unique())\n",
|
||||
"unique_speakers = len(comment_phab_df['speaker'].unique())\n",
|
||||
"\n",
|
||||
"print(f\"Unique conversation_ids: {unique_conversation_ids}\")\n",
|
||||
"print(f\"Unique ids: {unique_ids}\")\n",
|
||||
"print(f\"Unique speakers: {unique_speakers}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
|
||||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495ecba4bb0>"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -181,7 +206,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -214,7 +239,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 8,
|
||||
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -260,7 +285,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -273,7 +298,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 10,
|
||||
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -283,7 +308,7 @@
|
||||
"'i hate ve.interface, ve.interface always messes up i browser'"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -294,7 +319,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -302,7 +327,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
@ -310,13 +335,7 @@
|
||||
" \"\"\"Entry point for launching an IPython kernel.\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" \n"
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -354,18 +373,16 @@
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"comment_phab_df['resolved_text'][46088]"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"id": "92bf47ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
|
||||
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_rel_phab_comments.csv\", index=False)"
|
||||
]
|
||||
}
|
||||
],
|
Before Width: | Height: | Size: 55 KiB After Width: | Height: | Size: 55 KiB |
@ -1,48 +0,0 @@
|
||||
library(dplyr)
|
||||
library(ggplot2)
|
||||
phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"
|
||||
phab_data <- read.csv(phab_data_path, header=TRUE)
|
||||
|
||||
phab_data <- phab_data |>
|
||||
mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |>
|
||||
mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |>
|
||||
mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
|
||||
mutate(comment_id = row_number())|>
|
||||
filter(date_created < 1383264000 & date_created > 1351728000)
|
||||
#looking at all data between 11-1-2012 and 11-1-2013
|
||||
|
||||
length(unique(phab_data$date_created))
|
||||
|
||||
#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) +
|
||||
# geom_point(alpha = 0.5) +
|
||||
# theme_minimal()
|
||||
#g
|
||||
|
||||
library(udpipe)
|
||||
#library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
|
||||
|
||||
library(tidytext)
|
||||
library(dplyr)
|
||||
library(stringr)
|
||||
|
||||
# we first need to transform our comment level of analysis into sentences
|
||||
sentence_level_data <- phab_data |>
|
||||
unnest_tokens(sentence, comment_text, token = "sentences") |>
|
||||
group_by(comment_id) |>
|
||||
mutate(sentence_id = row_number())|>
|
||||
dplyr::select(-has_bot_ref, -has_ref)|>
|
||||
mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |>
|
||||
mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |>
|
||||
ungroup()
|
||||
|
||||
|
||||
library(udpipe)
|
||||
library(rsyntax)
|
||||
# Load necessary libraries
|
||||
library(spacyr)
|
||||
spacy_install()
|
||||
#we only care about stuff that mentions VE rn, then tokenize
|
||||
sentence_level_data <- sentence_level_data |>
|
||||
filter(has_ref == TRUE) |>
|
||||
mutate(sentence_tokens = udpipe(sentence, "english"))
|
||||
|
@ -1,219 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
|
||||
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
|
||||
"phab_df = pd.read_csv(phab_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/spacy/util.py:910: UserWarning: [W095] Model 'en_coreference_web_trf' (3.4.0a2) was trained with spaCy v3.3.0 and may not be 100% compatible with the current version (3.7.5). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
|
||||
" warnings.warn(warn_msg)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495edce13c0>"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nlp = spacy.load(\"en_core_web_trf\")\n",
|
||||
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
|
||||
"\n",
|
||||
"# use replace_listeners for the coref components\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
|
||||
"\n",
|
||||
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
|
||||
"nlp.add_pipe(\"merge_entities\")\n",
|
||||
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
|
||||
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://github.com/explosion/spaCy/discussions/13572\n",
|
||||
"# https://github.com/explosion/spaCy/issues/13111 \n",
|
||||
"# https://explosion.ai/blog/coref\n",
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"from spacy.tokens import Doc\n",
|
||||
"# Define lightweight function for resolving references in text\n",
|
||||
"def resolve_references(doc: Doc) -> str:\n",
|
||||
" \"\"\"Function for resolving references with the coref ouput\n",
|
||||
" doc (Doc): The Doc object processed by the coref pipeline\n",
|
||||
" RETURNS (str): The Doc string with resolved references\n",
|
||||
" \"\"\"\n",
|
||||
" # token.idx : token.text\n",
|
||||
" token_mention_mapper = {}\n",
|
||||
" output_string = \"\"\n",
|
||||
" clusters = [\n",
|
||||
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Iterate through every found cluster\n",
|
||||
" for cluster in clusters:\n",
|
||||
" first_mention = cluster[0]\n",
|
||||
" # Iterate through every other span in the cluster\n",
|
||||
" for mention_span in list(cluster)[1:]:\n",
|
||||
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
|
||||
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
|
||||
" \n",
|
||||
" for token in mention_span[1:]:\n",
|
||||
" # Set empty string for all the other tokens in mention_span\n",
|
||||
" token_mention_mapper[token.idx] = \"\"\n",
|
||||
"\n",
|
||||
" # Iterate through every token in the Doc\n",
|
||||
" for token in doc:\n",
|
||||
" # Check if token exists in token_mention_mapper\n",
|
||||
" if token.idx in token_mention_mapper:\n",
|
||||
" output_string += token_mention_mapper[token.idx]\n",
|
||||
" # Else add original token text\n",
|
||||
" else:\n",
|
||||
" output_string += token.text + token.whitespace_\n",
|
||||
"\n",
|
||||
" return output_string\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def resolving_comment(text):\n",
|
||||
" doc = nlp(text)\n",
|
||||
" resolved_text = resolve_references(doc)\n",
|
||||
" return resolved_text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'i hate ve.interface, ve.interface always messes up i browser'"
|
||||
]
|
||||
},
|
||||
"execution_count": 73,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
||||
" with torch.cuda.amp.autocast(self._mixed_precision):\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"phab_df['text'] = phab_df['comment_text'].apply(str)\n",
|
||||
"phab_df['resolved_text'] = phab_df['text'].apply(resolving_comment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,393 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
|
||||
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
|
||||
"phab_df = pd.read_csv(phab_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e30e81ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#because of compute issues, need to do the sampling before the coreference resolution\n",
|
||||
"def http_relevant(text):\n",
|
||||
" if pd.isnull(text):\n",
|
||||
" return False\n",
|
||||
" # expanded dictionary for relevancy\n",
|
||||
" # http, login, SSL, TLS, certificate \n",
|
||||
" for word in text.split():\n",
|
||||
" if \"://\" not in word.lower():\n",
|
||||
" #http\n",
|
||||
" if \"http\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #login\n",
|
||||
" if \"login\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #ssl\n",
|
||||
" if \"ssl\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #tls\n",
|
||||
" if \"tls\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #cert\n",
|
||||
" if word.lower().startswith(\"cert\"):\n",
|
||||
" return True\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
"def is_migrated(comment_text):\n",
|
||||
" if pd.isnull(comment_text):\n",
|
||||
" return False\n",
|
||||
" text = comment_text.strip()\n",
|
||||
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
|
||||
" return True \n",
|
||||
" return False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f359805f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
|
||||
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||||
"\n",
|
||||
"#cleaning df\n",
|
||||
"phab_df['id'] = phab_df.index + 1\n",
|
||||
"#may have to build out the reply_to column \n",
|
||||
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||||
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||||
"\n",
|
||||
"phab_df = phab_df.rename(columns={\n",
|
||||
" 'AuthorPHID': 'speaker',\n",
|
||||
" 'TaskPHID': 'conversation_id',\n",
|
||||
" 'WMFaffil':'meta.affil',\n",
|
||||
" 'isGerrit': 'meta.gerrit'\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# after 12-1-2012 before 12-1-2013\n",
|
||||
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
|
||||
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
|
||||
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
|
||||
"\n",
|
||||
"#removing headless conversations\n",
|
||||
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
|
||||
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
|
||||
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
|
||||
"\n",
|
||||
"#removing gerrit comments \n",
|
||||
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
|
||||
"\n",
|
||||
"# filter out the sourceforge migration \n",
|
||||
"# Originally from: http://sourceforge.net in the task task_summary\n",
|
||||
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
|
||||
"\n",
|
||||
"#cut down to only the data that is relevant (mentions http)\n",
|
||||
"relevant_conversation_ids = task_phab_df[\n",
|
||||
" task_phab_df['comment_text'].apply(http_relevant) |\n",
|
||||
" task_phab_df['task_title'].apply(http_relevant)\n",
|
||||
"]['conversation_id'].unique()\n",
|
||||
"\n",
|
||||
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||||
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||||
"\n",
|
||||
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||||
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||||
"\n",
|
||||
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
|
||||
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
|
||||
"#comment_phab_df = mid_comment_phab_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nlp = spacy.load(\"en_core_web_trf\")\n",
|
||||
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
|
||||
"\n",
|
||||
"# use replace_listeners for the coref components\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
|
||||
"\n",
|
||||
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
|
||||
"nlp.add_pipe(\"merge_entities\")\n",
|
||||
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
|
||||
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://github.com/explosion/spaCy/discussions/13572\n",
|
||||
"# https://github.com/explosion/spaCy/issues/13111 \n",
|
||||
"# https://explosion.ai/blog/coref\n",
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "424d35e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"from spacy.tokens import Doc\n",
|
||||
"# Define lightweight function for resolving references in text\n",
|
||||
"def resolve_references(doc: Doc) -> str:\n",
|
||||
" \"\"\"Function for resolving references with the coref ouput\n",
|
||||
" doc (Doc): The Doc object processed by the coref pipeline\n",
|
||||
" RETURNS (str): The Doc string with resolved references\n",
|
||||
" \"\"\"\n",
|
||||
" # token.idx : token.text\n",
|
||||
" token_mention_mapper = {}\n",
|
||||
" output_string = \"\"\n",
|
||||
" clusters = [\n",
|
||||
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Iterate through every found cluster\n",
|
||||
" for cluster in clusters:\n",
|
||||
" first_mention = cluster[0]\n",
|
||||
" # Iterate through every other span in the cluster\n",
|
||||
" for mention_span in list(cluster)[1:]:\n",
|
||||
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
|
||||
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
|
||||
" \n",
|
||||
" for token in mention_span[1:]:\n",
|
||||
" # Set empty string for all the other tokens in mention_span\n",
|
||||
" token_mention_mapper[token.idx] = \"\"\n",
|
||||
"\n",
|
||||
" # Iterate through every token in the Doc\n",
|
||||
" for token in doc:\n",
|
||||
" # Check if token exists in token_mention_mapper\n",
|
||||
" if token.idx in token_mention_mapper:\n",
|
||||
" output_string += token_mention_mapper[token.idx]\n",
|
||||
" # Else add original token text\n",
|
||||
" else:\n",
|
||||
" output_string += token.text + token.whitespace_\n",
|
||||
"\n",
|
||||
" return output_string\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def resolving_comment(text):\n",
|
||||
" doc = nlp(text)\n",
|
||||
" resolved_text = resolve_references(doc)\n",
|
||||
" return resolved_text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'i hate ve.interface, ve.interface always messes up i browser'"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" \"\"\"Entry point for launching an IPython kernel.\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n",
|
||||
"comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "46088",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;31mKeyError\u001b[0m: 46088",
|
||||
"\nThe above exception was the direct cause of the following exception:\n",
|
||||
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m/tmp/ipykernel_61233/1116300830.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomment_phab_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'resolved_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m46088\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mkey_is_scalar\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 943\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_hashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1049\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1050\u001b[0m \u001b[0;31m# Similar to Index.get_value, but we do not fall back to positional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1051\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1052\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_values_for_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3365\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasnans\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mKeyError\u001b[0m: 46088"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"comment_phab_df['resolved_text'][46088]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "92bf47ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,487 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "fcc726a8-44a4-48cf-a1cd-937b05bd4d08",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "1fceca29-48c1-4ba3-93ba-88724dea22a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"first_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_to_2014.csv\"\n",
|
||||
"first_resolved_df = pd.read_csv(first_resolved_path)\n",
|
||||
"second_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_2014_to_2015.csv\"\n",
|
||||
"second_resolved_df = pd.read_csv(second_resolved_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f26c31e7-bee1-4100-821f-769e5b1791bd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"8621"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(second_resolved_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "dfa81ca2-4d66-4679-bc3e-192d0cac67fa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"5007"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(first_resolved_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "6dc11bda-f0f6-4eb6-96f5-02ed9a3492ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"13628"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"combined_df = pd.concat([first_resolved_df, second_resolved_df])\n",
|
||||
"unique_df = combined_df.drop_duplicates()\n",
|
||||
"len(unique_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0c903199-8159-455c-aa7f-e57ef07ce03e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>task_title</th>\n",
|
||||
" <th>comment_text</th>\n",
|
||||
" <th>date_created</th>\n",
|
||||
" <th>speaker</th>\n",
|
||||
" <th>meta.affil</th>\n",
|
||||
" <th>conversation_id</th>\n",
|
||||
" <th>comment_type</th>\n",
|
||||
" <th>status</th>\n",
|
||||
" <th>meta.gerrit</th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>reply_to</th>\n",
|
||||
" <th>timestamp</th>\n",
|
||||
" <th>is_relevant</th>\n",
|
||||
" <th>is_migrated</th>\n",
|
||||
" <th>text</th>\n",
|
||||
" <th>resolved_text</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>User with unattached accounts unable to login ...</td>\n",
|
||||
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
|
||||
" <td>1411541280</td>\n",
|
||||
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
|
||||
" <td>task_description</td>\n",
|
||||
" <td>resolved</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>243215</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2014-09-24 06:48:00+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
|
||||
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>User with unattached accounts unable to login ...</td>\n",
|
||||
" <td>Revert has been deployed.</td>\n",
|
||||
" <td>1411573104</td>\n",
|
||||
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>243216</td>\n",
|
||||
" <td>243215.0</td>\n",
|
||||
" <td>2014-09-24 15:38:24+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Revert has been deployed.</td>\n",
|
||||
" <td>Revert has been deployed.</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>User with unattached accounts unable to login ...</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
|
||||
" <td>1411572378</td>\n",
|
||||
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>243217</td>\n",
|
||||
" <td>243216.0</td>\n",
|
||||
" <td>2014-09-24 15:26:18+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>User with unattached accounts unable to login ...</td>\n",
|
||||
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
|
||||
" <td>1411545535</td>\n",
|
||||
" <td>PHID-USER-v7bwpq3rs3zdxegibdbh</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>243218</td>\n",
|
||||
" <td>243217.0</td>\n",
|
||||
" <td>2014-09-24 07:58:55+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
|
||||
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>User with unattached accounts unable to login ...</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
|
||||
" <td>1411542640</td>\n",
|
||||
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>243219</td>\n",
|
||||
" <td>243218.0</td>\n",
|
||||
" <td>2014-09-24 07:10:40+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
|
||||
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8616</th>\n",
|
||||
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
|
||||
" <td>> When I registered, phabricator linked mediaw...</td>\n",
|
||||
" <td>1413205650</td>\n",
|
||||
" <td>PHID-USER-hgn5uw2jafgjgfvxibhh</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>378799</td>\n",
|
||||
" <td>378798.0</td>\n",
|
||||
" <td>2014-10-13 13:07:30+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>> When I registered, phabricator linked mediaw...</td>\n",
|
||||
" <td>> When I registered, phabricator linked mediaw...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8617</th>\n",
|
||||
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
|
||||
" <td>See {T574} for a related discussion.</td>\n",
|
||||
" <td>1412958953</td>\n",
|
||||
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>378800</td>\n",
|
||||
" <td>378799.0</td>\n",
|
||||
" <td>2014-10-10 16:35:53+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>See {T574} for a related discussion.</td>\n",
|
||||
" <td>See {T574} for a related discussion.</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8618</th>\n",
|
||||
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
|
||||
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
|
||||
" <td>1412362816</td>\n",
|
||||
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
|
||||
" <td>task_description</td>\n",
|
||||
" <td>declined</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>378858</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2014-10-03 19:00:16+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
|
||||
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8619</th>\n",
|
||||
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
|
||||
" <td>I guess the same restrictions as in T543 apply...</td>\n",
|
||||
" <td>1412415111</td>\n",
|
||||
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>378860</td>\n",
|
||||
" <td>378859.0</td>\n",
|
||||
" <td>2014-10-04 09:31:51+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>I guess the same restrictions as in T543 apply...</td>\n",
|
||||
" <td>I guess the same restrictions as in T543 apply...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8620</th>\n",
|
||||
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
|
||||
" <td>It's not entirely trivial to change</td>\n",
|
||||
" <td>1412366627</td>\n",
|
||||
" <td>PHID-USER-fn7qnpccfbitivgtw2rt</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>378861</td>\n",
|
||||
" <td>378860.0</td>\n",
|
||||
" <td>2014-10-03 20:03:47+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>It's not entirely trivial to change</td>\n",
|
||||
" <td>It's not entirely trivial to change</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>13628 rows × 16 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" task_title \\\n",
|
||||
"0 User with unattached accounts unable to login ... \n",
|
||||
"1 User with unattached accounts unable to login ... \n",
|
||||
"2 User with unattached accounts unable to login ... \n",
|
||||
"3 User with unattached accounts unable to login ... \n",
|
||||
"4 User with unattached accounts unable to login ... \n",
|
||||
"... ... \n",
|
||||
"8616 OAuth login refers to mediawiki.org:/ instead ... \n",
|
||||
"8617 OAuth login refers to mediawiki.org:/ instead ... \n",
|
||||
"8618 Improvements to Wikimedia SUL login dialog UI:... \n",
|
||||
"8619 Improvements to Wikimedia SUL login dialog UI:... \n",
|
||||
"8620 Improvements to Wikimedia SUL login dialog UI:... \n",
|
||||
"\n",
|
||||
" comment_text date_created \\\n",
|
||||
"0 User:NickK reported in IRC that they're gettin... 1411541280 \n",
|
||||
"1 Revert has been deployed. 1411573104 \n",
|
||||
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... 1411572378 \n",
|
||||
"3 (In reply to Kunal Mehta (Legoktm) from commen... 1411545535 \n",
|
||||
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... 1411542640 \n",
|
||||
"... ... ... \n",
|
||||
"8616 > When I registered, phabricator linked mediaw... 1413205650 \n",
|
||||
"8617 See {T574} for a related discussion. 1412958953 \n",
|
||||
"8618 Some improvements to the Wikimedia SUL dialog:... 1412362816 \n",
|
||||
"8619 I guess the same restrictions as in T543 apply... 1412415111 \n",
|
||||
"8620 It's not entirely trivial to change 1412366627 \n",
|
||||
"\n",
|
||||
" speaker meta.affil \\\n",
|
||||
"0 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
|
||||
"1 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
|
||||
"2 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
|
||||
"3 PHID-USER-v7bwpq3rs3zdxegibdbh False \n",
|
||||
"4 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
|
||||
"... ... ... \n",
|
||||
"8616 PHID-USER-hgn5uw2jafgjgfvxibhh False \n",
|
||||
"8617 PHID-USER-lluzkul4z7us4sxkayss False \n",
|
||||
"8618 PHID-USER-lluzkul4z7us4sxkayss False \n",
|
||||
"8619 PHID-USER-lluzkul4z7us4sxkayss False \n",
|
||||
"8620 PHID-USER-fn7qnpccfbitivgtw2rt False \n",
|
||||
"\n",
|
||||
" conversation_id comment_type status meta.gerrit \\\n",
|
||||
"0 PHID-TASK-mio2uq45ny7mms72syut task_description resolved False \n",
|
||||
"1 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
|
||||
"2 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
|
||||
"3 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
|
||||
"4 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
|
||||
"... ... ... ... ... \n",
|
||||
"8616 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
|
||||
"8617 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
|
||||
"8618 PHID-TASK-j6czqxlv5fzcx3tmq23n task_description declined False \n",
|
||||
"8619 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
|
||||
"8620 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
|
||||
"\n",
|
||||
" id reply_to timestamp is_relevant is_migrated \\\n",
|
||||
"0 243215 NaN 2014-09-24 06:48:00+00:00 True False \n",
|
||||
"1 243216 243215.0 2014-09-24 15:38:24+00:00 True False \n",
|
||||
"2 243217 243216.0 2014-09-24 15:26:18+00:00 True False \n",
|
||||
"3 243218 243217.0 2014-09-24 07:58:55+00:00 True False \n",
|
||||
"4 243219 243218.0 2014-09-24 07:10:40+00:00 True False \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"8616 378799 378798.0 2014-10-13 13:07:30+00:00 True False \n",
|
||||
"8617 378800 378799.0 2014-10-10 16:35:53+00:00 True False \n",
|
||||
"8618 378858 NaN 2014-10-03 19:00:16+00:00 True False \n",
|
||||
"8619 378860 378859.0 2014-10-04 09:31:51+00:00 True False \n",
|
||||
"8620 378861 378860.0 2014-10-03 20:03:47+00:00 True False \n",
|
||||
"\n",
|
||||
" text \\\n",
|
||||
"0 User:NickK reported in IRC that they're gettin... \n",
|
||||
"1 Revert has been deployed. \n",
|
||||
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
|
||||
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
|
||||
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
|
||||
"... ... \n",
|
||||
"8616 > When I registered, phabricator linked mediaw... \n",
|
||||
"8617 See {T574} for a related discussion. \n",
|
||||
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
|
||||
"8619 I guess the same restrictions as in T543 apply... \n",
|
||||
"8620 It's not entirely trivial to change \n",
|
||||
"\n",
|
||||
" resolved_text \n",
|
||||
"0 User:NickK reported in IRC that they're gettin... \n",
|
||||
"1 Revert has been deployed. \n",
|
||||
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
|
||||
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
|
||||
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
|
||||
"... ... \n",
|
||||
"8616 > When I registered, phabricator linked mediaw... \n",
|
||||
"8617 See {T574} for a related discussion. \n",
|
||||
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
|
||||
"8619 I guess the same restrictions as in T543 apply... \n",
|
||||
"8620 It's not entirely trivial to change \n",
|
||||
"\n",
|
||||
"[13628 rows x 16 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unique_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0c392d70-6236-4dfe-b6d4-bbe3f422b09e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"unique_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\", index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,779 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
|
||||
" warnings.warn(\"Can't initialize NVML\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd \n",
|
||||
"import spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv\"\n",
|
||||
"phab_df = pd.read_csv(phab_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e30e81ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#because of compute issues, need to do the sampling before the coreference resolution\n",
|
||||
"def http_relevant(text):\n",
|
||||
" if pd.isnull(text):\n",
|
||||
" return False\n",
|
||||
" # expanded dictionary for relevancy\n",
|
||||
" # http, login, SSL, TLS, certificate \n",
|
||||
" for word in text.split():\n",
|
||||
" if \"://\" not in word.lower():\n",
|
||||
" #http\n",
|
||||
" if \"http\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #login\n",
|
||||
" if \"login\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #ssl\n",
|
||||
" if \"ssl\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #tls\n",
|
||||
" if \"tls\" in word.lower():\n",
|
||||
" return True\n",
|
||||
" #cert\n",
|
||||
" if word.lower().startswith(\"cert\") and not word.lower().startswith(\"certain\"):\n",
|
||||
" return True\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
"def is_migrated(comment_text):\n",
|
||||
" if pd.isnull(comment_text):\n",
|
||||
" return False\n",
|
||||
" text = comment_text.strip()\n",
|
||||
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
|
||||
" return True \n",
|
||||
" return False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f359805f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:45: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
|
||||
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
|
||||
"\n",
|
||||
"#cleaning df\n",
|
||||
"phab_df['id'] = phab_df.index + 1\n",
|
||||
"#may have to build out the reply_to column \n",
|
||||
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
|
||||
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
|
||||
"\n",
|
||||
"phab_df = phab_df.rename(columns={\n",
|
||||
" 'AuthorPHID': 'speaker',\n",
|
||||
" 'TaskPHID': 'conversation_id',\n",
|
||||
" 'WMFaffil':'meta.affil',\n",
|
||||
" 'isGerrit': 'meta.gerrit'\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# after 10-01-2014 before 10-01-2015\n",
|
||||
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
|
||||
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] >= 1412207999)]\n",
|
||||
"# after 07-01-2013 before 10-01-2015\n",
|
||||
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] > 1372636800)]\n",
|
||||
"\n",
|
||||
"#removing headless conversations\n",
|
||||
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
|
||||
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
|
||||
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
|
||||
"\n",
|
||||
"#removing gerrit comments \n",
|
||||
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
|
||||
"\n",
|
||||
"# filter out the sourceforge migration \n",
|
||||
"# i.e. tasks whose description starts with 'Originally from: http://sourceforge.net'\n",
|
||||
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
|
||||
"\n",
|
||||
"#cut down to only the data that is relevant (mentions http, login, ssl, tls, or cert*)\n",
|
||||
"relevant_conversation_ids = task_phab_df[\n",
|
||||
" task_phab_df['comment_text'].apply(http_relevant) |\n",
|
||||
" task_phab_df['task_title'].apply(http_relevant)\n",
|
||||
"]['conversation_id'].unique()\n",
|
||||
"\n",
|
||||
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||||
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
|
||||
"\n",
|
||||
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||||
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
|
||||
"\n",
|
||||
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
|
||||
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
|
||||
"#comment_phab_df = mid_comment_phab_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4241cb0a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>task_title</th>\n",
|
||||
" <th>comment_text</th>\n",
|
||||
" <th>date_created</th>\n",
|
||||
" <th>speaker</th>\n",
|
||||
" <th>meta.affil</th>\n",
|
||||
" <th>conversation_id</th>\n",
|
||||
" <th>comment_type</th>\n",
|
||||
" <th>status</th>\n",
|
||||
" <th>meta.gerrit</th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>reply_to</th>\n",
|
||||
" <th>timestamp</th>\n",
|
||||
" <th>is_relevant</th>\n",
|
||||
" <th>is_migrated</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>197</th>\n",
|
||||
" <td>Creation of instances broken</td>\n",
|
||||
" <td>After a replace of old instances, it is not po...</td>\n",
|
||||
" <td>1442753295</td>\n",
|
||||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||||
" <td>task_description</td>\n",
|
||||
" <td>resolved</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>198</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2015-09-20 12:48:15+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>198</th>\n",
|
||||
" <td>Creation of instances broken</td>\n",
|
||||
" <td>Works now.</td>\n",
|
||||
" <td>1442864673</td>\n",
|
||||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>199</td>\n",
|
||||
" <td>198.0</td>\n",
|
||||
" <td>2015-09-21 19:44:33+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>199</th>\n",
|
||||
" <td>Creation of instances broken</td>\n",
|
||||
" <td>Ok, the instances are deleted now, I will recr...</td>\n",
|
||||
" <td>1442864271</td>\n",
|
||||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>200</td>\n",
|
||||
" <td>199.0</td>\n",
|
||||
" <td>2015-09-21 19:37:51+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>200</th>\n",
|
||||
" <td>Creation of instances broken</td>\n",
|
||||
" <td>The new instances have the same names as recen...</td>\n",
|
||||
" <td>1442854156</td>\n",
|
||||
" <td>PHID-USER-22bsa5u75jz3ci3wnplu</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>201</td>\n",
|
||||
" <td>200.0</td>\n",
|
||||
" <td>2015-09-21 16:49:16+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>201</th>\n",
|
||||
" <td>Creation of instances broken</td>\n",
|
||||
" <td>This happens also with jessie and presice inst...</td>\n",
|
||||
" <td>1442835238</td>\n",
|
||||
" <td>PHID-USER-qlodcndtwpolbdhncjis</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-pitdrld6mszruqmc6usf</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>202</td>\n",
|
||||
" <td>201.0</td>\n",
|
||||
" <td>2015-09-21 11:33:58+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>406887</th>\n",
|
||||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||||
" <td>*** Bug 49454 has been marked as a duplicate o...</td>\n",
|
||||
" <td>1379011061</td>\n",
|
||||
" <td>PHID-USER-2nnm76h4ykalvvref2ye</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>406888</td>\n",
|
||||
" <td>406887.0</td>\n",
|
||||
" <td>2013-09-12 18:37:41+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>406888</th>\n",
|
||||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||||
" <td>JFTR, on Tools mosh-server processes eat up to...</td>\n",
|
||||
" <td>1376245807</td>\n",
|
||||
" <td>PHID-USER-vk6mlmacfhx77egryy5i</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>406889</td>\n",
|
||||
" <td>406888.0</td>\n",
|
||||
" <td>2013-08-11 18:30:07+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>406889</th>\n",
|
||||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||||
" <td>This is supported on tools, but adding it to t...</td>\n",
|
||||
" <td>1376185312</td>\n",
|
||||
" <td>PHID-USER-h75guknmwivm6x37iute</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>406890</td>\n",
|
||||
" <td>406889.0</td>\n",
|
||||
" <td>2013-08-11 01:41:52+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>406890</th>\n",
|
||||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||||
" <td>Just found out that mosh already works for too...</td>\n",
|
||||
" <td>1376118400</td>\n",
|
||||
" <td>PHID-USER-5dqihbanu3caaj7pigif</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>406891</td>\n",
|
||||
" <td>406890.0</td>\n",
|
||||
" <td>2013-08-10 07:06:40+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>406891</th>\n",
|
||||
" <td>Allow login using mosh as an alternative to pl...</td>\n",
|
||||
" <td>(In reply to comment #0)\\n> ssh is quite painf...</td>\n",
|
||||
" <td>1376118251</td>\n",
|
||||
" <td>PHID-USER-6vzzsmi22zem6yttr6vp</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>PHID-TASK-hnwvtmwgpm2oisoqaozt</td>\n",
|
||||
" <td>task_subcomment</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>406892</td>\n",
|
||||
" <td>406891.0</td>\n",
|
||||
" <td>2013-08-10 07:04:11+00:00</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>14490 rows × 14 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" task_title \\\n",
|
||||
"197 Creation of instances broken \n",
|
||||
"198 Creation of instances broken \n",
|
||||
"199 Creation of instances broken \n",
|
||||
"200 Creation of instances broken \n",
|
||||
"201 Creation of instances broken \n",
|
||||
"... ... \n",
|
||||
"406887 Allow login using mosh as an alternative to pl... \n",
|
||||
"406888 Allow login using mosh as an alternative to pl... \n",
|
||||
"406889 Allow login using mosh as an alternative to pl... \n",
|
||||
"406890 Allow login using mosh as an alternative to pl... \n",
|
||||
"406891 Allow login using mosh as an alternative to pl... \n",
|
||||
"\n",
|
||||
" comment_text date_created \\\n",
|
||||
"197 After a replace of old instances, it is not po... 1442753295 \n",
|
||||
"198 Works now. 1442864673 \n",
|
||||
"199 Ok, the instances are deleted now, I will recr... 1442864271 \n",
|
||||
"200 The new instances have the same names as recen... 1442854156 \n",
|
||||
"201 This happens also with jessie and presice inst... 1442835238 \n",
|
||||
"... ... ... \n",
|
||||
"406887 *** Bug 49454 has been marked as a duplicate o... 1379011061 \n",
|
||||
"406888 JFTR, on Tools mosh-server processes eat up to... 1376245807 \n",
|
||||
"406889 This is supported on tools, but adding it to t... 1376185312 \n",
|
||||
"406890 Just found out that mosh already works for too... 1376118400 \n",
|
||||
"406891 (In reply to comment #0)\\n> ssh is quite painf... 1376118251 \n",
|
||||
"\n",
|
||||
" speaker meta.affil \\\n",
|
||||
"197 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||||
"198 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||||
"199 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||||
"200 PHID-USER-22bsa5u75jz3ci3wnplu False \n",
|
||||
"201 PHID-USER-qlodcndtwpolbdhncjis False \n",
|
||||
"... ... ... \n",
|
||||
"406887 PHID-USER-2nnm76h4ykalvvref2ye False \n",
|
||||
"406888 PHID-USER-vk6mlmacfhx77egryy5i False \n",
|
||||
"406889 PHID-USER-h75guknmwivm6x37iute False \n",
|
||||
"406890 PHID-USER-5dqihbanu3caaj7pigif False \n",
|
||||
"406891 PHID-USER-6vzzsmi22zem6yttr6vp False \n",
|
||||
"\n",
|
||||
" conversation_id comment_type status \\\n",
|
||||
"197 PHID-TASK-pitdrld6mszruqmc6usf task_description resolved \n",
|
||||
"198 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||||
"199 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||||
"200 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||||
"201 PHID-TASK-pitdrld6mszruqmc6usf task_subcomment NaN \n",
|
||||
"... ... ... ... \n",
|
||||
"406887 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||||
"406888 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||||
"406889 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||||
"406890 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||||
"406891 PHID-TASK-hnwvtmwgpm2oisoqaozt task_subcomment NaN \n",
|
||||
"\n",
|
||||
" meta.gerrit id reply_to timestamp is_relevant \\\n",
|
||||
"197 False 198 NaN 2015-09-20 12:48:15+00:00 True \n",
|
||||
"198 False 199 198.0 2015-09-21 19:44:33+00:00 True \n",
|
||||
"199 False 200 199.0 2015-09-21 19:37:51+00:00 True \n",
|
||||
"200 False 201 200.0 2015-09-21 16:49:16+00:00 True \n",
|
||||
"201 False 202 201.0 2015-09-21 11:33:58+00:00 True \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"406887 False 406888 406887.0 2013-09-12 18:37:41+00:00 True \n",
|
||||
"406888 False 406889 406888.0 2013-08-11 18:30:07+00:00 True \n",
|
||||
"406889 False 406890 406889.0 2013-08-11 01:41:52+00:00 True \n",
|
||||
"406890 False 406891 406890.0 2013-08-10 07:06:40+00:00 True \n",
|
||||
"406891 False 406892 406891.0 2013-08-10 07:04:11+00:00 True \n",
|
||||
"\n",
|
||||
" is_migrated \n",
|
||||
"197 False \n",
|
||||
"198 False \n",
|
||||
"199 False \n",
|
||||
"200 False \n",
|
||||
"201 False \n",
|
||||
"... ... \n",
|
||||
"406887 False \n",
|
||||
"406888 False \n",
|
||||
"406889 False \n",
|
||||
"406890 False \n",
|
||||
"406891 False \n",
|
||||
"\n",
|
||||
"[14490 rows x 14 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"comment_phab_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "930c4d9c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" This is separate from the ipykernel package so we can avoid doing imports until\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"862"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prior_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\"\n",
|
||||
"prior_df = pd.read_csv(prior_path)\n",
|
||||
"comment_phab_df['timestamp'] = pd.to_datetime(comment_phab_df['timestamp'], utc=True)\n",
|
||||
"prior_df['timestamp'] = pd.to_datetime(prior_df['timestamp'], utc=True)\n",
|
||||
"merged_df = comment_phab_df.merge(prior_df, how='outer', indicator=True)\n",
|
||||
"len(merged_df)\n",
|
||||
"only_in_comment_phab_df = merged_df[merged_df['_merge'] == 'left_only']\n",
|
||||
"len(only_in_comment_phab_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x154d9952a7c0>"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nlp = spacy.load(\"en_core_web_trf\")\n",
|
||||
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
|
||||
"\n",
|
||||
"# use replace_listeners for the coref components\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
|
||||
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
|
||||
"\n",
|
||||
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
|
||||
"nlp.add_pipe(\"merge_entities\")\n",
|
||||
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
|
||||
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://github.com/explosion/spaCy/discussions/13572\n",
|
||||
"# https://github.com/explosion/spaCy/issues/13111 \n",
|
||||
"# https://explosion.ai/blog/coref\n",
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "424d35e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
|
||||
"from spacy.tokens import Doc\n",
|
||||
"# Define lightweight function for resolving references in text\n",
|
||||
"def resolve_references(doc: Doc) -> str:\n",
|
||||
" \"\"\"Function for resolving references with the coref output\n",
|
||||
" doc (Doc): The Doc object processed by the coref pipeline\n",
|
||||
" RETURNS (str): The Doc string with resolved references\n",
|
||||
" \"\"\"\n",
|
||||
" # token.idx : token.text\n",
|
||||
" token_mention_mapper = {}\n",
|
||||
" output_string = \"\"\n",
|
||||
" clusters = [\n",
|
||||
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Iterate through every found cluster\n",
|
||||
" for cluster in clusters:\n",
|
||||
" first_mention = cluster[0]\n",
|
||||
" # Iterate through every other span in the cluster\n",
|
||||
" for mention_span in list(cluster)[1:]:\n",
|
||||
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
|
||||
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
|
||||
" \n",
|
||||
" for token in mention_span[1:]:\n",
|
||||
" # Set empty string for all the other tokens in mention_span\n",
|
||||
" token_mention_mapper[token.idx] = \"\"\n",
|
||||
"\n",
|
||||
" # Iterate through every token in the Doc\n",
|
||||
" for token in doc:\n",
|
||||
" # Check if token exists in token_mention_mapper\n",
|
||||
" if token.idx in token_mention_mapper:\n",
|
||||
" output_string += token_mention_mapper[token.idx]\n",
|
||||
" # Else add original token text\n",
|
||||
" else:\n",
|
||||
" output_string += token.text + token.whitespace_\n",
|
||||
"\n",
|
||||
" return output_string\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def resolving_comment(text):\n",
|
||||
" doc = nlp(text)\n",
|
||||
" resolved_text = resolve_references(doc)\n",
|
||||
" return resolved_text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'i hate ve.interface, ve.interface always messes up i browser'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" \"\"\"Entry point for launching an IPython kernel.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"only_in_comment_phab_df['text'] = only_in_comment_phab_df['comment_text'].apply(str)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "79e3f7e2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors\n",
|
||||
"/gscratch/scrubbed/mjilg/envs/coref-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" \"\"\"Entry point for launching an IPython kernel.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"only_in_comment_phab_df['resolved_text'] = only_in_comment_phab_df['text'].apply(resolving_comment)\n",
|
||||
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"7423 [Backport was merged into 1.24wmf16 upon a tim...\n",
|
||||
"7902 I guess this can be closed now as RESOLVED WOR...\n",
|
||||
"7905 The upstream issue is https://github.com/jcgre...\n",
|
||||
"7906 An update on this. In Amsterdam we found at th...\n",
|
||||
"7907 Yes. It's used by people using pywikibot-as-a-...\n",
|
||||
" ... \n",
|
||||
"14465 I amended the title to the range IE8-10 becaus...\n",
|
||||
"14466 If I remember correctly this problem was at le...\n",
|
||||
"14467 If I remember correctly this problem was at le...\n",
|
||||
"14468 After a quick test, autocomplete seems to work...\n",
|
||||
"14478 Still not merged, so we can't really do much.\n",
|
||||
"Name: resolved_text, Length: 862, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"only_in_comment_phab_df['resolved_text']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92bf47ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"only_in_comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv\", index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|