import pandas as pd
import spacy
from spacy.tokens import Doc

# because of compute issues, the sampling needs to happen before the coreference resolution

def http_relevant(text):
    if pd.isnull(text):
        return False
    # expanded dictionary for relevancy:
    # http, login, SSL, TLS, certificate
    for word in text.split():
        # skip words that are themselves URLs
        if "://" not in word.lower():
            # http
            if "http" in word.lower():
                return True
            # login
            if "login" in word.lower():
                return True
            # ssl
            if "ssl" in word.lower():
                return True
            # tls
            if "tls" in word.lower():
                return True
            # cert
            if word.lower().startswith("cert"):
                return True
    return False
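
# Quick illustration of the filter above (hypothetical inputs, not from the source data):
# a bare URL is skipped by the "://" guard, while a standalone protocol mention matches.
#   http_relevant("see https://example.org")       -> False
#   http_relevant("we should enforce HTTPS here")  -> True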

def resolving_comment(text):
    # relies on the module-level `nlp` pipeline assembled in main()
    doc = nlp(text)
    resolved_text = resolve_references(doc)
    return resolved_text

# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output

    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_

            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""

    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_

    return output_string
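
# Illustrative usage (an assumption, not code from this script): once main() has built
# the coref pipeline into `nlp`, a call like
#   resolve_references(nlp("The cert expired because it was not renewed."))
# should return the text with "it" rewritten to its cluster's first mention, "The cert".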

def is_migrated(comment_text):
    if pd.isnull(comment_text):
        return False
    text = comment_text.strip()
    if text.startswith("Originally from: http://sourceforge.net"):
        return True
    return False

def main():
    global nlp  # resolving_comment() reads the pipeline assembled below

    phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0415_http_phab_comments.csv"
    phab_df = pd.read_csv(phab_path)

    # flag the gerrit phab user: PHID-USER-idceizaw6elwiwm5xshb
    phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'

    # cleaning df
    phab_df['id'] = phab_df.index + 1
    # may have to build out the reply_to column
    phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
    phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)
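
    # Sketch of what the groupby().shift() yields (illustrative values only, not real data):
    #   id   TaskPHID   reply_to
    #    1     T1         None    <- first comment in its task
    #    2     T1         1
    #    3     T2         None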

    phab_df = phab_df.rename(columns={
        'AuthorPHID': 'speaker',
        'TaskPHID': 'conversation_id',
        'WMFaffil': 'meta.affil',
        'isGerrit': 'meta.gerrit'
    })

    # keep rows after 04-01-2015 (1427846400) and before 10-01-2015 (1443657600), UTC
    phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
    filtered_phab_df = phab_df[(phab_df['date_created'] < 1443657600) & (phab_df['date_created'] > 1427846400)]
    #filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]

    # removing headless conversations (tasks with no task_description row)
    # .copy() avoids SettingWithCopyWarning when flag columns are added below
    task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == "task_description"].copy()
    headed_task_phids = task_phab_df['conversation_id'].unique()
    filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

    # removing gerrit comments
    mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()

    # filter out the sourceforge migration:
    # "Originally from: http://sourceforge.net" in the task description
    migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()

    # cut down to only the data that is relevant (mentions http)
    relevant_conversation_ids = task_phab_df[
        task_phab_df['comment_text'].apply(http_relevant) |
        task_phab_df['task_title'].apply(http_relevant)
    ]['conversation_id'].unique()

    task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
    mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)

    task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
    mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)

    comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True)
                                          & (mid_comment_phab_df['is_migrated'] != True)]
    task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]
    #comment_phab_df = mid_comment_phab_df

    nlp = spacy.load("en_core_web_trf")
    nlp_coref = spacy.load("en_coreference_web_trf")

    # use replace_listeners for the coref components
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

    # we won't copy over the span cleaner - this keeps the head cluster information, which we want
    nlp.add_pipe("merge_entities")
    nlp.add_pipe("coref", source=nlp_coref)
    nlp.add_pipe("span_resolver", source=nlp_coref)