import pandas as pd
import spacy
from spacy.tokens import Doc

# because of compute issues, the sampling needs to happen before the coreference resolution

def http_relevant(text):
    if pd.isnull(text):
        return False
    # expanded dictionary for relevancy:
    # http, login, SSL, TLS, certificate
    for word in text.split():
        # skip words that are themselves URLs
        if "://" not in word.lower():
            # http
            if "http" in word.lower():
                return True
            # login
            if "login" in word.lower():
                return True
            # ssl
            if "ssl" in word.lower():
                return True
            # tls
            if "tls" in word.lower():
                return True
            # cert
            if word.lower().startswith("cert"):
                return True
    return False
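
# Quick illustration of the filter above (hypothetical inputs, not from the source data):
# a bare URL is skipped by the "://" guard, while a standalone protocol mention matches.
#   http_relevant("see https://example.org")       -> False
#   http_relevant("we should enforce HTTPS here")  -> True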

def resolving_comment(text):
    # relies on the module-level `nlp` pipeline assembled in main()
    doc = nlp(text)
    resolved_text = resolve_references(doc)
    return resolved_text

# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref output

    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_

            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""

    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_

    return output_string
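
# Illustrative usage (an assumption, not code from this script): once main() has built
# the coref pipeline into `nlp`, a call like
#   resolve_references(nlp("The cert expired because it was not renewed."))
# should return the text with "it" rewritten to its cluster's first mention, "The cert".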

def is_migrated(comment_text):
    if pd.isnull(comment_text):
        return False
    text = comment_text.strip()
    if text.startswith("Originally from: http://sourceforge.net"):
        return True
    return False

def main():
    global nlp  # resolving_comment() reads the pipeline assembled below

    phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0415_http_phab_comments.csv"
    phab_df = pd.read_csv(phab_path)

    # flag the gerrit phab user: PHID-USER-idceizaw6elwiwm5xshb
    phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'

    # cleaning df
    phab_df['id'] = phab_df.index + 1
    # may have to build out the reply_to column
    phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
    phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)
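
    # Sketch of what the groupby().shift() yields (illustrative values only, not real data):
    #   id   TaskPHID   reply_to
    #    1     T1         None    <- first comment in its task
    #    2     T1         1
    #    3     T2         None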

    phab_df = phab_df.rename(columns={
        'AuthorPHID': 'speaker',
        'TaskPHID': 'conversation_id',
        'WMFaffil': 'meta.affil',
        'isGerrit': 'meta.gerrit'
    })

    # keep rows after 04-01-2015 (1427846400) and before 10-01-2015 (1443657600), UTC
    phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
    filtered_phab_df = phab_df[(phab_df['date_created'] < 1443657600) & (phab_df['date_created'] > 1427846400)]
    #filtered_phab_df = phab_df[(phab_df['date_created'] < 1381691276) & (phab_df['date_created'] > 1379975444)]

    # removing headless conversations (tasks with no task_description row)
    # .copy() avoids SettingWithCopyWarning when flag columns are added below
    task_phab_df = filtered_phab_df[filtered_phab_df['comment_type'] == "task_description"].copy()
    headed_task_phids = task_phab_df['conversation_id'].unique()
    filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

    # removing gerrit comments
    mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True].copy()

    # filter out the sourceforge migration:
    # "Originally from: http://sourceforge.net" in the task description
    migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()

    # cut down to only the data that is relevant (mentions http)
    relevant_conversation_ids = task_phab_df[
        task_phab_df['comment_text'].apply(http_relevant) |
        task_phab_df['task_title'].apply(http_relevant)
    ]['conversation_id'].unique()

    task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
    mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)

    task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
    mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)

    comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True)
                                          & (mid_comment_phab_df['is_migrated'] != True)]
    task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]
    #comment_phab_df = mid_comment_phab_df

    nlp = spacy.load("en_core_web_trf")
    nlp_coref = spacy.load("en_coreference_web_trf")

    # use replace_listeners for the coref components
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

    # we won't copy over the span cleaner - this keeps the head cluster information, which we want
    nlp.add_pipe("merge_entities")
    nlp.add_pipe("coref", source=nlp_coref)
    nlp.add_pipe("span_resolver", source=nlp_coref)