
c3 initial phab

Matthew Gaughan 2025-04-15 16:41:05 -07:00
parent 1c6fc80b66
commit 8b3c40d2a2
16 changed files with 50815 additions and 13 deletions

View File

@@ -74,3 +74,11 @@ ls
 rm event_0403_mediawiki_core_weekly_commit_count_data.csv
 rm event_0403_mediawiki_wmfconfig_weekly_commit_count_data.csv
 ls
+cd ..
+ls
+cd case3
+ls
+mv core_2010-01-01_to_2024-12-31.csv mediawiki_core.csv
+ls
+mv mediawiki_core.csv mediawiki_core_commits.csv
+ls

View File

@@ -5,7 +5,7 @@ library(tidyr)
 library(purrr)
 library(stringr)
-https_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/mediawiki_core_commits.csv"
+https_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/mediawiki_core_commits.csv"
 contains_http_but_not_url <- function(text) {
   if (is.na(text)) {
@@ -45,7 +45,7 @@ transform_commit_data <- function(filepath){
   # TODO: this is project/event specific
-  event_date <- as.Date("2013-08-28")
+  event_date <- as.Date("2015-07-02")
   #event_date <- as.Date("2013-07-01")
   #event_date <- as.Date("2013-04-25")
   #event_date <- as.Date("2012-12-11")
@@ -67,6 +67,10 @@ transform_commit_data <- function(filepath){
   df <- df |>
     mutate(age = project_age)
 
+  #drop out data from ''before'' the release process
+  df <- df |>
+    filter(commit_date >= as.Date("2015-04-01"))
+
   #we are looking at weekly data, 6m before and 6m after
   #start_date <- event_date %m-% months(6)
   calculated_start_date <- event_date %m-% months(12)
@@ -92,7 +96,7 @@ transform_commit_data <- function(filepath){
   # list all author_emails with >5 commits
   # for big df: if author not in the list, 'new' author
   old_author_list <- df |>
-    filter(commit_date < as.Date("2013-08-01"))|>
+    filter(commit_date < as.Date("2015-06-01"))|>
     group_by(author_email) |>
     summarise(commit_count = n()) |>
     filter(commit_count > 5) |>
@@ -188,7 +192,7 @@ transform_commit_data <- function(filepath){
   # ) |>
   weekly_commits <- weekly_commits |>
-    filter(relative_week >= (-52) & relative_week <= 52 )
+    filter(relative_week >= (-14) & relative_week <= 52 )
 
   #gracefully exit
   return(weekly_commits)
@@ -203,7 +207,7 @@ transform_relevant_commit_data <- function(filepath){
   # TODO: this is project/event specific
-  event_date <- as.Date("2013-08-28")
+  event_date <- as.Date("2015-07-02")
   #event_date <- as.Date("2013-07-01")
   #event_date <- as.Date("2013-04-25")
   #event_date <- as.Date("2012-12-11")
@@ -225,6 +229,10 @@ transform_relevant_commit_data <- function(filepath){
   df <- df |>
     mutate(age = project_age)
 
+  #drop out data from ''before'' the release process
+  df <- df |>
+    filter(commit_date >= as.Date("2015-04-01"))
+
   #we are looking at weekly data, 6m before and 6m after
   #start_date <- event_date %m-% months(6)
   calculated_start_date <- event_date %m-% months(12)
@@ -247,12 +255,12 @@ transform_relevant_commit_data <- function(filepath){
   # new_author_unaff = if_else(!grepl("@wikimedia", author_email), new_author, 0)) |>
   # ungroup()
-  # cut the df to all before 06-01-2013
+  # cut the df to all before 06-01-2015
   # group by author_email
   # list all author_emails with >5 commits
   # for big df: if author not in the list, 'new' author
   old_author_list <- df |>
-    filter(commit_date < as.Date("2013-08-01"))|>
+    filter(commit_date < as.Date("2015-06-01"))|>
     group_by(author_email) |>
     summarise(commit_count = n()) |>
     filter(commit_count > 5) |>
@@ -349,14 +357,14 @@ transform_relevant_commit_data <- function(filepath){
   # ) |>
   weekly_commits <- weekly_commits |>
-    filter(relative_week >= (-52) & relative_week <= 52 )
+    filter(relative_week >= (-14) & relative_week <= 52 )
   #gracefully exit
   return(weekly_commits)
 }
-transformed <- transform_relevant_commit_data(https_commit_fp)
-output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/relevant_event_0413_mediawiki_core_weekly_commit_count_data.csv"
+transformed <- transform_commit_data(https_commit_fp)
+output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/event_0415_mediawiki_core_weekly_commit_count_data.csv"
 write.csv(transformed, output_filepath, row.names = FALSE)
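
For orientation, the windowing and "new author" logic this diff retargets can be sketched in a few lines of pandas. This is an illustrative sketch only, not project code: the commit_date and author_email column names and the thresholds mirror the R pipeline above.

import pandas as pd

EVENT_DATE = pd.Timestamp("2015-07-02")
NEW_AUTHOR_CUTOFF = pd.Timestamp("2015-06-01")

df = pd.read_csv("mediawiki_core_commits.csv", parse_dates=["commit_date"])
# drop data from before the release process began
df = df[df["commit_date"] >= pd.Timestamp("2015-04-01")]

# authors with >5 commits before the cutoff are established; everyone else is 'new'
pre_cutoff = df[df["commit_date"] < NEW_AUTHOR_CUTOFF]
old_authors = pre_cutoff.groupby("author_email").size().loc[lambda s: s > 5].index
df["new_author"] = ~df["author_email"].isin(old_authors)

# weekly counts relative to the event date, clipped to the -14..52 week window
df["relative_week"] = (df["commit_date"] - EVENT_DATE).dt.days // 7
weekly = (
    df[df["relative_week"].between(-14, 52)]
    .groupby("relative_week")
    .agg(commit_count=("author_email", "size"),
         new_commit_count=("new_author", "sum"))
    .reset_index()
)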

View File

@@ -1,5 +1,5 @@
 library(tidyverse)
-count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/relevant_event_0413_mediawiki_core_weekly_commit_count_data.csv"
+count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/event_0415_mediawiki_core_weekly_commit_count_data.csv"
 input_df <- read.csv(count_data_fp, header = TRUE)
 input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count
@@ -27,7 +27,7 @@ new_authors <- long_df |>
   labels = c("nonbot_commit_count" = "Total Nonbot Commits",
              "unaff_new_commit_count" = "New Unaffiliated Commits",
              "wmf_new_commit_count" = "New WMF Commits")) +
-  ggtitle("relevant MW-core Commits Around HTTPS as-default ('New' contributors <= 5 commits before 08-01-2013)") +
+  ggtitle("Total MW-core Commits Around HTTP-deprecation ('New' contributors <= 5 commits between 04-01-2015 and 06-01-2015)") +
   theme_bw() +
   theme(legend.position = "top")
 new_authors
@@ -72,7 +72,7 @@ commit_share_plot <- share_long |>
   geom_point() +
   labs(x = "Relative Week", y = "Share of Nonbot Commits", color="Commit Author Affiliation") +
   scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
-  ggtitle("MW-core Nonbot 'relevant' Commit Share Around HTTPS-as-default") +
+  ggtitle("MW-core Nonbot Total Commit Share Around HTTP-deprecation") +
   theme_bw() +
   theme(legend.position = "top")
 commit_share_plot
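
The share plot reads the weekly counts written by the transform script above. A hedged pandas sketch of the share series it draws: only the nonbot_commit_count derivation is taken directly from the script; wmf_commit_count is an assumed column name for organizationally affiliated commits.

import pandas as pd

weekly = pd.read_csv("event_0415_mediawiki_core_weekly_commit_count_data.csv")
# from the script above: nonbot commits = total minus bot-authored commits
weekly["nonbot_commit_count"] = weekly["commit_count"] - weekly["bot_commit_count"]
# wmf_commit_count is an assumed column for organizationally affiliated commits
weekly["affiliated_share"] = weekly["wmf_commit_count"] / weekly["nonbot_commit_count"]
weekly["unaffiliated_share"] = 1.0 - weekly["affiliated_share"]
share_long = weekly.melt(id_vars="relative_week",
                         value_vars=["unaffiliated_share", "affiliated_share"],
                         var_name="affiliation", value_name="share")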

View File

@@ -0,0 +1,18 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:34533 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: kgAQecq+G8pssOaH78tv
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 25397730
slurmstepd: error: *** JOB 25397730 ON n3439 CANCELLED AT 2025-04-15T15:37:51 ***

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,393 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e30e81ad",
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
" # expanded dictionary for relevancy\n",
" # http, login, SSL, TLS, certificate \n",
" for word in text.split():\n",
" if \"://\" not in word.lower():\n",
" #http\n",
" if \"http\" in word.lower():\n",
" return True\n",
" #login\n",
" if \"login\" in word.lower():\n",
" return True\n",
" #ssl\n",
" if \"ssl\" in word.lower():\n",
" return True\n",
" #tls\n",
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\"):\n",
" return True\n",
" return False\n",
"\n",
"def is_migrated(comment_text):\n",
" if pd.isnull(comment_text):\n",
" return False\n",
" text = comment_text.strip()\n",
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
" return True \n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f359805f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
" 'AuthorPHID': 'speaker',\n",
" 'TaskPHID': 'conversation_id',\n",
" 'WMFaffil':'meta.affil',\n",
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"# after 12-1-2012 before 12-1-2013\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
"\n",
"#removing headless conversations\n",
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
"\n",
"#removing gerrit comments \n",
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
"\n",
"# filter out the sourceforge migration \n",
"# Originally from: http://sourceforge.net in the task task_summary\n",
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
"\n",
"#cut down to only the data that is relevant (mentions http)\n",
"relevant_conversation_ids = task_phab_df[\n",
" task_phab_df['comment_text'].apply(http_relevant) |\n",
" task_phab_df['task_title'].apply(http_relevant)\n",
"]['conversation_id'].unique()\n",
"\n",
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"\n",
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"\n",
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
"#comment_phab_df = mid_comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
]
}
],
"source": [
"comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n",
"comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "46088",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_61233/1116300830.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomment_phab_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'resolved_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m46088\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mkey_is_scalar\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 943\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_hashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1049\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1050\u001b[0m \u001b[0;31m# Similar to Index.get_value, but we do not fall back to positional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1051\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1052\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_values_for_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3365\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasnans\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088"
]
}
],
"source": [
"comment_phab_df['resolved_text'][46088]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
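
The stderr above warns that some comments exceed the coref transformer's 512-token limit (911 > 512). A hedged workaround sketch, reusing the notebook's resolving_comment(): split long comments into sentence batches under a conservative token budget and resolve each batch separately. The 400-token budget and the blank-pipeline sentencizer are assumptions, not project code, and coreference links that span chunk boundaries are lost.

import spacy

splitter = spacy.blank("en")
splitter.add_pipe("sentencizer")

def resolve_in_chunks(text, max_tokens=400):
    # batch sentences under a rough token budget, then resolve each batch;
    # len(sent) counts spaCy tokens, a heuristic proxy for transformer subtokens
    chunks, current, budget = [], [], 0
    for sent in splitter(str(text)).sents:
        if budget + len(sent) > max_tokens and current:
            chunks.append(" ".join(current))
            current, budget = [], 0
        current.append(sent.text)
        budget += len(sent)
    if current:
        chunks.append(" ".join(current))
    return " ".join(resolving_comment(chunk) for chunk in chunks)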

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,393 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b270bd36-529e-4595-a780-ef6c8151c31f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
" warnings.warn(\"Can't initialize NVML\")\n"
]
}
],
"source": [
"import pandas as pd \n",
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {},
"outputs": [],
"source": [
"phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/0402_https1_phab_comments.csv\"\n",
"phab_df = pd.read_csv(phab_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e30e81ad",
"metadata": {},
"outputs": [],
"source": [
"#because of compute issues, need to do the sampling before the coreference resolution\n",
"def http_relevant(text):\n",
" if pd.isnull(text):\n",
" return False\n",
" # expanded dictionary for relevancy\n",
" # http, login, SSL, TLS, certificate \n",
" for word in text.split():\n",
" if \"://\" not in word.lower():\n",
" #http\n",
" if \"http\" in word.lower():\n",
" return True\n",
" #login\n",
" if \"login\" in word.lower():\n",
" return True\n",
" #ssl\n",
" if \"ssl\" in word.lower():\n",
" return True\n",
" #tls\n",
" if \"tls\" in word.lower():\n",
" return True\n",
" #cert\n",
" if word.lower().startswith(\"cert\"):\n",
" return True\n",
" return False\n",
"\n",
"def is_migrated(comment_text):\n",
" if pd.isnull(comment_text):\n",
" return False\n",
" text = comment_text.strip()\n",
" if text.startswith(\"Originally from: http://sourceforge.net\"):\n",
" return True \n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f359805f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb\n",
"phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'\n",
"\n",
"#cleaning df\n",
"phab_df['id'] = phab_df.index + 1\n",
"#may have to build out the reply_to column \n",
"phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()\n",
"phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)\n",
"\n",
"phab_df = phab_df.rename(columns={\n",
" 'AuthorPHID': 'speaker',\n",
" 'TaskPHID': 'conversation_id',\n",
" 'WMFaffil':'meta.affil',\n",
" 'isGerrit': 'meta.gerrit'\n",
"})\n",
"\n",
"# after 12-1-2012 before 12-1-2013\n",
"phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
"filtered_phab_df = phab_df[(phab_df['date_created'] < 1385856000) & (phab_df['date_created'] > 1354320000)]\n",
"#filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]\n",
"\n",
"#removing headless conversations\n",
"task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']==\"task_description\"]\n",
"headed_task_phids = task_phab_df['conversation_id'].unique()\n",
"filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]\n",
"\n",
"#removing gerrit comments \n",
"mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]\n",
"\n",
"# filter out the sourceforge migration \n",
"# Originally from: http://sourceforge.net in the task task_summary\n",
"migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()\n",
"\n",
"#cut down to only the data that is relevant (mentions http)\n",
"relevant_conversation_ids = task_phab_df[\n",
" task_phab_df['comment_text'].apply(http_relevant) |\n",
" task_phab_df['task_title'].apply(http_relevant)\n",
"]['conversation_id'].unique()\n",
"\n",
"task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"\n",
"task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)\n",
"\n",
"comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]\n",
"task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]\n",
"#comment_phab_df = mid_comment_phab_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14ba49228520>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp = spacy.load(\"en_core_web_trf\")\n",
"nlp_coref = spacy.load(\"en_coreference_web_trf\")\n",
"\n",
"# use replace_listeners for the coref components\n",
"nlp_coref.replace_listeners(\"transformer\", \"coref\", [\"model.tok2vec\"])\n",
"nlp_coref.replace_listeners(\"transformer\", \"span_resolver\", [\"model.tok2vec\"])\n",
"\n",
"# we won't copy over the span cleaner - this keeps the head cluster information, which we want\n",
"nlp.add_pipe(\"merge_entities\")\n",
"nlp.add_pipe(\"coref\", source=nlp_coref)\n",
"nlp.add_pipe(\"span_resolver\", source=nlp_coref)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/explosion/spaCy/discussions/13572\n",
"# https://github.com/explosion/spaCy/issues/13111 \n",
"# https://explosion.ai/blog/coref\n",
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"doc = nlp(\"John is frustrated with the VisualEditor project, he thinks it doesn't work.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {},
"outputs": [],
"source": [
"# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466\n",
"from spacy.tokens import Doc\n",
"# Define lightweight function for resolving references in text\n",
"def resolve_references(doc: Doc) -> str:\n",
" \"\"\"Function for resolving references with the coref ouput\n",
" doc (Doc): The Doc object processed by the coref pipeline\n",
" RETURNS (str): The Doc string with resolved references\n",
" \"\"\"\n",
" # token.idx : token.text\n",
" token_mention_mapper = {}\n",
" output_string = \"\"\n",
" clusters = [\n",
" val for key, val in doc.spans.items() if key.startswith(\"coref_cluster\")\n",
" ]\n",
"\n",
" # Iterate through every found cluster\n",
" for cluster in clusters:\n",
" first_mention = cluster[0]\n",
" # Iterate through every other span in the cluster\n",
" for mention_span in list(cluster)[1:]:\n",
" # Set first_mention as value for the first token in mention_span in the token_mention_mapper\n",
" token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_\n",
" \n",
" for token in mention_span[1:]:\n",
" # Set empty string for all the other tokens in mention_span\n",
" token_mention_mapper[token.idx] = \"\"\n",
"\n",
" # Iterate through every token in the Doc\n",
" for token in doc:\n",
" # Check if token exists in token_mention_mapper\n",
" if token.idx in token_mention_mapper:\n",
" output_string += token_mention_mapper[token.idx]\n",
" # Else add original token text\n",
" else:\n",
" output_string += token.text + token.whitespace_\n",
"\n",
" return output_string\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {},
"outputs": [],
"source": [
"def resolving_comment(text):\n",
" doc = nlp(text)\n",
" resolved_text = resolve_references(doc)\n",
" return resolved_text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'i hate ve.interface, ve.interface always messes up i browser'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resolving_comment(\"i hate ve.interface, it always messes up my browser\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "46873641-8e88-4829-9e24-4dd5e6749bd1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n",
"/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \n"
]
}
],
"source": [
"comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n",
"comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "46088",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_61233/1116300830.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomment_phab_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'resolved_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m46088\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mkey_is_scalar\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 943\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_hashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1049\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1050\u001b[0m \u001b[0;31m# Similar to Index.get_value, but we do not fall back to positional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1051\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1052\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_values_for_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3365\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasnans\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088"
]
}
],
"source": [
"comment_phab_df['resolved_text'][46088]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/041325_coref_rel_phab_comments.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long