In [1]:
import pandas as pd 
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0422_http_phab_comments.csv"
phab_df = pd.read_csv(phab_path)

In [3]:
#because of compute issues, need to do the sampling before the coreference resolution
def http_relevant(text):
    if pd.isnull(text):
        return False
    # expanded dictionary for relevancy
    # http, login, SSL, TLS, certificate 
    for word in text.split():
        if "://" not in word.lower():
            #http
            if "http" in word.lower():
                return True
            #login
            if "login" in word.lower():
                return True
            #ssl
            if "ssl" in word.lower():
                return True
            #tls
            if "tls" in word.lower():
                return True
            #cert
            if word.lower().startswith("cert") and not word.lower().startswith("certain"):
                return True
    return False

def is_migrated(comment_text):
    if pd.isnull(comment_text):
        return False
    text = comment_text.strip()
    if text.startswith("Originally from: http://sourceforge.net"):
        return True 
    return False

In [4]:
#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb
phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'

#cleaning df
phab_df['id'] = phab_df.index + 1
#may have to build out the reply_to column 
phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)

phab_df = phab_df.rename(columns={
    'AuthorPHID': 'speaker',
    'TaskPHID': 'conversation_id',
    'WMFaffil':'meta.affil',
    'isGerrit': 'meta.gerrit'
})

# after 10-01-2014 before 10-01-2015
phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)
#filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] >= 1412207999)]
# after 07-01-2013 before 10-01-2015
filtered_phab_df = phab_df[(phab_df['date_created'] < 1443743999) & (phab_df['date_created'] > 1372636800)]

#removing headless conversations
task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']=="task_description"]
headed_task_phids = task_phab_df['conversation_id'].unique()
filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

#removing gerrit comments 
mid_comment_phab_df = filtered_phab_df[filtered_phab_df['meta.gerrit'] != True]

# filter out the sourceforge migration 
# Originally from: http://sourceforge.net in the task task_summary
migrated_conversation_ids = task_phab_df[task_phab_df['comment_text'].apply(is_migrated)]['conversation_id'].unique()

#cut down to only the data that is relevant (mentions http)
relevant_conversation_ids = task_phab_df[
    task_phab_df['comment_text'].apply(http_relevant) |
    task_phab_df['task_title'].apply(http_relevant)
]['conversation_id'].unique()

task_phab_df['is_relevant'] = task_phab_df['conversation_id'].isin(relevant_conversation_ids)
mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)

task_phab_df['is_migrated'] = task_phab_df['conversation_id'].isin(migrated_conversation_ids)
mid_comment_phab_df['is_migrated'] = mid_comment_phab_df['conversation_id'].isin(migrated_conversation_ids)

comment_phab_df = mid_comment_phab_df[(mid_comment_phab_df['is_relevant'] == True) & (mid_comment_phab_df['is_migrated'] != True)]
task_phab_df = task_phab_df[(task_phab_df['is_relevant'] == True) & (task_phab_df['is_migrated'] != True)]
#comment_phab_df = mid_comment_phab_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
comment_phab_df

Unnamed: 0,task_title,comment_text,date_created,speaker,meta.affil,conversation_id,comment_type,status,meta.gerrit,id,reply_to,timestamp,is_relevant,is_migrated
197,Creation of instances broken,"After a replace of old instances, it is not po...",1442753295,PHID-USER-qlodcndtwpolbdhncjis,False,PHID-TASK-pitdrld6mszruqmc6usf,task_description,resolved,False,198,,2015-09-20 12:48:15+00:00,True,False
198,Creation of instances broken,Works now.,1442864673,PHID-USER-qlodcndtwpolbdhncjis,False,PHID-TASK-pitdrld6mszruqmc6usf,task_subcomment,,False,199,198.0,2015-09-21 19:44:33+00:00,True,False
199,Creation of instances broken,"Ok, the instances are deleted now, I will recr...",1442864271,PHID-USER-qlodcndtwpolbdhncjis,False,PHID-TASK-pitdrld6mszruqmc6usf,task_subcomment,,False,200,199.0,2015-09-21 19:37:51+00:00,True,False
200,Creation of instances broken,The new instances have the same names as recen...,1442854156,PHID-USER-22bsa5u75jz3ci3wnplu,False,PHID-TASK-pitdrld6mszruqmc6usf,task_subcomment,,False,201,200.0,2015-09-21 16:49:16+00:00,True,False
201,Creation of instances broken,This happens also with jessie and presice inst...,1442835238,PHID-USER-qlodcndtwpolbdhncjis,False,PHID-TASK-pitdrld6mszruqmc6usf,task_subcomment,,False,202,201.0,2015-09-21 11:33:58+00:00,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406887,Allow login using mosh as an alternative to pl...,*** Bug 49454 has been marked as a duplicate o...,1379011061,PHID-USER-2nnm76h4ykalvvref2ye,False,PHID-TASK-hnwvtmwgpm2oisoqaozt,task_subcomment,,False,406888,406887.0,2013-09-12 18:37:41+00:00,True,False
406888,Allow login using mosh as an alternative to pl...,"JFTR, on Tools mosh-server processes eat up to...",1376245807,PHID-USER-vk6mlmacfhx77egryy5i,False,PHID-TASK-hnwvtmwgpm2oisoqaozt,task_subcomment,,False,406889,406888.0,2013-08-11 18:30:07+00:00,True,False
406889,Allow login using mosh as an alternative to pl...,"This is supported on tools, but adding it to t...",1376185312,PHID-USER-h75guknmwivm6x37iute,False,PHID-TASK-hnwvtmwgpm2oisoqaozt,task_subcomment,,False,406890,406889.0,2013-08-11 01:41:52+00:00,True,False
406890,Allow login using mosh as an alternative to pl...,Just found out that mosh already works for too...,1376118400,PHID-USER-5dqihbanu3caaj7pigif,False,PHID-TASK-hnwvtmwgpm2oisoqaozt,task_subcomment,,False,406891,406890.0,2013-08-10 07:06:40+00:00,True,False


In [12]:
prior_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv"
prior_df = pd.read_csv(prior_path)
comment_phab_df['timestamp'] = pd.to_datetime(comment_phab_df['timestamp'], utc=True)
prior_df['timestamp'] = pd.to_datetime(prior_df['timestamp'], utc=True)
merged_df = comment_phab_df.merge(prior_df, how='outer', indicator=True)
len(merged_df)
only_in_comment_phab_df = merged_df[merged_df['_merge'] == 'left_only']
len(only_in_comment_phab_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


862

In [13]:
nlp = spacy.load("en_core_web_trf")
nlp_coref = spacy.load("en_coreference_web_trf")

# use replace_listeners for the coref components
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

# we won't copy over the span cleaner - this keeps the head cluster information, which we want
nlp.add_pipe("merge_entities")
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x154d9952a7c0>

In [14]:
# https://github.com/explosion/spaCy/discussions/13572
# https://github.com/explosion/spaCy/issues/13111 
# https://explosion.ai/blog/coref
# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466
doc = nlp("John is frustrated with the VisualEditor project, he thinks it doesn't work.")


In [15]:
doc

John is frustrated with the VisualEditor project, he thinks it doesn't work.

In [16]:
# https://gist.github.com/thomashacker/b5dd6042c092e0a22c2b9243a64a2466
from spacy.tokens import Doc
# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref ouput
    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            
            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""

    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_

    return output_string


In [17]:
def resolving_comment(text):
    doc = nlp(text)
    resolved_text = resolve_references(doc)
    return resolved_text

In [18]:
resolving_comment("i hate ve.interface, it always messes up my browser")

'i hate ve.interface, ve.interface always messes up i browser'

In [19]:
only_in_comment_phab_df['text'] = only_in_comment_phab_df['comment_text'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
only_in_comment_phab_df['resolved_text'] = only_in_comment_phab_df['text'].apply(resolving_comment)
only_in_comment_phab_df.to_csv("/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv", index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [33]:
only_in_comment_phab_df['resolved_text']

7423     [Backport was merged into 1.24wmf16 upon a tim...
7902     I guess this can be closed now as RESOLVED WOR...
7905     The upstream issue is https://github.com/jcgre...
7906     An update on this. In Amsterdam we found at th...
7907     Yes. It's used by people using pywikibot-as-a-...
                               ...                        
14465    I amended the title to the range IE8-10 becaus...
14466    If I remember correctly this problem was at le...
14467    If I remember correctly this problem was at le...
14468    After a quick test, autocomplete seems to work...
14478        Still not merged, so we can't really do much.
Name: resolved_text, Length: 862, dtype: object

In [None]:
only_in_comment_phab_df.to_csv("/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/050825_coref_rel_phab_stragglers.csv", index=False)