# mw-lifecycle-analysis/text_analysis/coreference_resolution.py
import pandas as pd
import spacy
from spacy.tokens import Doc
nlp = spacy.load("en_core_web_trf")
nlp_coref = spacy.load("en_coreference_web_trf")
print('spacy models loaded')
# use replace_listeners for the coref components
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])
# we won't copy over the span cleaner - this keeps the head cluster information, which we want
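# merge_entities collapses each multi-token named entity into a single token,
# so entity mentions stay intact when references are resolved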
nlp.add_pipe("merge_entities")
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)
# because of compute constraints, sampling needs to happen before the coreference resolution
def http_relevant(text):
    if pd.isnull(text):
        return False
    # expanded dictionary for relevancy:
    # http, login, SSL, TLS, certificate
    for word in text.split():
        # skip tokens containing "://" so bare links alone do not count as relevant
        if "://" not in word.lower():
            if "http" in word.lower():
                return True
            if "login" in word.lower():
                return True
            if "ssl" in word.lower():
                return True
            if "tls" in word.lower():
                return True
            if word.lower().startswith("cert"):
                return True
    return False
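# Illustrative behavior (hypothetical inputs): "Switch to HTTPS" is relevant
# because "https" contains "http"; "see http://example.org" is not, since the
# only matching token contains "://" and is skipped as a URL.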
def resolving_comment(text):
    doc = nlp(text)
    resolved_text = resolve_references(doc)
    return resolved_text
# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output
    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]
    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            # Set empty string for all the other tokens in mention_span
            for token in mention_span[1:]:
                token_mention_mapper[token.idx] = ""
    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_
    return output_string
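# Illustrative example (hypothetical input): if the pipeline clusters
# ("Alice", "She") in "Alice filed a bug. She fixed it.", the resolved string
# becomes "Alice filed a bug. Alice fixed it."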
def is_migrated(comment_text):
    if pd.isnull(comment_text):
        return False
    text = comment_text.strip()
    if text.startswith("Originally from: http://sourceforge.net"):
        return True
    return False
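# Example (hypothetical task description): a comment starting with
# "Originally from: http://sourceforge.net/p/..." is flagged as migrated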
def main():
    phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0415_http_phab_comments.csv"
    phab_df = pd.read_csv(phab_path)
    # flag comments from the Gerrit account (PHID: PHID-USER-idceizaw6elwiwm5xshb)
    phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'
    # cleaning df
    phab_df['id'] = phab_df.index + 1
    # approximate threading for the reply_to column: treat each comment as a
    # reply to the previous comment in the same task
    phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
    phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)
    phab_df = phab_df.rename(columns={
        'AuthorPHID': 'speaker',
        'TaskPHID': 'conversation_id',
        'WMFaffil': 'meta.affil',
        'isGerrit': 'meta.gerrit'
    })
    # keep comments after 2015-04-01 and before 2015-10-01
    phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
    filtered_phab_df = phab_df[(phab_df['date_created'] < 1443657600) & (phab_df['date_created'] > 1427846400)]
    #filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]
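    # sanity check for the window bounds (illustrative, not run):
    # pd.Timestamp(1427846400, unit="s", tz="UTC") -> 2015-04-01 00:00:00+00:00
    # pd.Timestamp(1443657600, unit="s", tz="UTC") -> 2015-10-01 00:00:00+00:00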
    # removing headless conversations (tasks with no task_description row);
    # .copy() avoids SettingWithCopyWarning on later column assignments
    task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == "task_description"].copy()
    headed_task_phids = task_phab_df['conversation_id'].unique()
    filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]
    # removing gerrit comments; .copy() again guards the later column assignments
    mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()
    # filter out the sourceforge migration:
    # "Originally from: http://sourceforge.net" in the task description
    migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()
    # cut down to only the data that is relevant (mentions http)
    relevant_conversation_ids = task_phab_df[
        task_phab_df['comment_text'].apply(http_relevant) |
        task_phab_df['task_title'].apply(http_relevant)
    ]['conversation_id'].unique()
    task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
    mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)
    task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
    mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)
    comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True)
                                          & (mid_comment_phab_df['is_migrated'] != True)].copy()
    task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]
    #comment_phab_df = mid_comment_phab_df
    print("about to resolve the pronouns")
    comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)
    comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)
    comment_phab_df.to_csv("/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/041525_coref_rel_phab_comments.csv", index=False)
if __name__ == "__main__":
    main()