import pandas as pd
import spacy
from spacy.tokens import Doc

# because of compute issues, the sampling needs to happen before coreference resolution

def http_relevant(text):
    if pd.isnull(text):
        return False
    # expanded dictionary for relevancy:
    # http, login, SSL, TLS, certificate
    for word in text.split():
        # skip tokens that are literal URLs; only prose mentions of the keywords count
        if "://" not in word.lower():
            # http
            if "http" in word.lower():
                return True
            # login
            if "login" in word.lower():
                return True
            # ssl
            if "ssl" in word.lower():
                return True
            # tls
            if "tls" in word.lower():
                return True
            # cert
            if word.lower().startswith("cert"):
                return True
    return False

def resolving_comment(text):
    # relies on the module-level `nlp` pipeline assembled in main()
    doc = nlp(text)
    resolved_text = resolve_references(doc)
    return resolved_text

# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output.

    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # maps token.idx -> replacement text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""

    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_

    return output_string

def is_migrated(comment_text):
    if pd.isnull(comment_text):
        return False
    text = comment_text.strip()
    if text.startswith("Originally from: http://sourceforge.net"):
        return True
    return False

def main():
    # expose the spaCy pipeline loaded below at module scope so resolving_comment() can see it
    global nlp

    phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0415_http_phab_comments.csv"
    phab_df = pd.read_csv(phab_path)

    # find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb
    phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'

    # cleaning df
    phab_df['id'] = phab_df.index + 1
    # may have to build out the reply_to column
    phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
    phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)

    phab_df = phab_df.rename(columns={
        'AuthorPHID': 'speaker',
        'TaskPHID': 'conversation_id',
        'WMFaffil': 'meta.affil',
        'isGerrit': 'meta.gerrit'
    })

    # keep comments created after 2015-04-01 and before 2015-10-01 (Unix timestamps)
    phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
    filtered_phab_df = phab_df[(phab_df['date_created'] < 1443657600) & (phab_df['date_created'] > 1427846400)]
    #filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]

    # removing headless conversations
    # (.copy() so later column assignments don't trigger SettingWithCopyWarning)
    task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == "task_description"].copy()
    headed_task_phids = task_phab_df['conversation_id'].unique()
    filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

    # removing gerrit comments
    mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()
    # filter out the sourceforge migration:
    # task descriptions starting with "Originally from: http://sourceforge.net"
    migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()

    # cut down to only the data that is relevant (mentions http)
    relevant_conversation_ids = task_phab_df[
        task_phab_df['comment_text'].apply(http_relevant) |
        task_phab_df['task_title'].apply(http_relevant)
    ]['conversation_id'].unique()

    task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
    mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)
    task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
    mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)

    comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]
    task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]
    #comment_phab_df = mid_comment_phab_df

    nlp = spacy.load("en_core_web_trf")
    nlp_coref = spacy.load("en_coreference_web_trf")

    # use replace_listeners for the coref components
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

    # we won't copy over the span cleaner - this keeps the head cluster information, which we want
    nlp.add_pipe("merge_entities")
    nlp.add_pipe("coref", source=nlp_coref)
    nlp.add_pipe("span_resolver", source=nlp_coref)
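    # --- hedged usage sketch (assumption, not taken from the original script) ---
    # With the coref pipeline assembled, the natural next step would be to run
    # resolving_comment() over each relevant, non-migrated comment. The
    # 'resolved_text' column name below is hypothetical; the script's actual
    # continuation may differ, so this is left commented out:
    #
    #   comment_phab_df['resolved_text'] = comment_phab_df['comment_text'].apply(resolving_comment)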