In [1]:
import convokit
import pandas as pd
from convokit import Corpus, download



In [2]:
phab_path = "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"
phab_df = pd.read_csv(phab_path)

In [4]:
#find gerrit phab PHID: PHID-USER-idceizaw6elwiwm5xshb
phab_df['isGerrit'] = phab_df['AuthorPHID'] == 'PHID-USER-idceizaw6elwiwm5xshb'
#cleaning df
phab_df['id'] = phab_df.index + 1
#may have to build out the reply_to column 
phab_df['reply_to'] = phab_df.groupby('TaskPHID')['id'].shift()
phab_df['reply_to'] = phab_df['reply_to'].where(pd.notnull(phab_df['reply_to']), None)

phab_df = phab_df.rename(columns={
    'date_created': 'timestamp',
    'comment_text': 'text',
    'AuthorPHID': 'speaker',
    'TaskPHID': 'conversation_id',
    'WMFaffil':'meta.affil',
    'isGerrit': 'meta.gerrit'
})
# after 11-1-2012 before 11-1-2013
filtered_phab_df = phab_df[(phab_df['timestamp'] < 1383264000) & (phab_df['timestamp'] > 1351728000)]

In [5]:
#removing convos that lack an initial task summary 
task_phab_df = filtered_phab_df[filtered_phab_df['comment_type']=="task_description"]
headed_task_phids = task_phab_df['conversation_id'].unique()
filtered_phab_df = filtered_phab_df[filtered_phab_df['conversation_id'].isin(headed_task_phids)]

In [6]:
phab_corpus = Corpus.from_pandas(filtered_phab_df)

3059it [00:00, 30585.95it/s]



9816it [00:00, 16005.09it/s]




In [7]:
no_bots_phab_corpus = Corpus.filter_utterances(phab_corpus, lambda utt: utt.meta['gerrit'] != True)
no_bots_phab_corpus.print_summary_stats()

Number of Speakers: 230
Number of Utterances: 8804
Number of Conversations: 2081


In [30]:
#looking at how language use differs between the two groups 
# https://github.com/CornellNLP/ConvoKit/blob/master/examples/text-processing/text_preprocessing_demo.ipynb
from convokit.text_processing import TextParser, TextProcessor, TextCleaner
import numpy as np

def preprocess_text(text):
    text = text.replace('*', ' ')
    text = text.replace('-', ' ')
    text = text.replace('\n', ' ')
    return text
    
prep = TextProcessor(proc_fn=preprocess_text, output_field='stripped_text')
cleaner = TextCleaner(replace_text=False)

In [31]:
no_bots_phab_corpus = prep.transform(no_bots_phab_corpus)
no_bots_phab_corpus = cleaner.transform(no_bots_phab_corpus)

100/8804 utterances processed
200/8804 utterances processed
300/8804 utterances processed
400/8804 utterances processed
500/8804 utterances processed
600/8804 utterances processed
700/8804 utterances processed
800/8804 utterances processed
900/8804 utterances processed
1000/8804 utterances processed
1100/8804 utterances processed
1200/8804 utterances processed
1300/8804 utterances processed
1400/8804 utterances processed
1500/8804 utterances processed
1600/8804 utterances processed
1700/8804 utterances processed
1800/8804 utterances processed
1900/8804 utterances processed
2000/8804 utterances processed
2100/8804 utterances processed
2200/8804 utterances processed
2300/8804 utterances processed
2400/8804 utterances processed
2500/8804 utterances processed
2600/8804 utterances processed
2700/8804 utterances processed
2800/8804 utterances processed
2900/8804 utterances processed
3000/8804 utterances processed
3100/8804 utterances processed
3200/8804 utterances processed
3300/8804 utteran

In [39]:
parser = TextParser(input_field='cleaned', verbosity=1000)
no_bots_phab_corpus = parser.transform(no_bots_phab_corpus)

1000/8804 utterances processed
2000/8804 utterances processed
3000/8804 utterances processed
4000/8804 utterances processed
5000/8804 utterances processed
6000/8804 utterances processed
7000/8804 utterances processed
8000/8804 utterances processed
8804/8804 utterances processed


In [40]:
no_bots_phab_corpus.organize_speaker_convo_history()

In [41]:
#https://saifmohammad.com/WebPages/nrc-vad.html
column_headings = ['Word', 'Valence', 'Arousal', 'Domination']
vad_lexicon = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\t', header=None, names=column_headings)

def vad_scoring(text):
    doc = nlp(text)
    vad_scores = {'valence': [], 'arousal': [], 'dominance': []}

    for token in doc:
        lemma = token.lemma_.lower()
        if lemma in vad_dict:
            vad_scores['valence'].append(vad_dict[lemma][0])
            vad_scores['arousal'].append(vad_dict[lemma][1])
            vad_scores['dominance'].append(vad_dict[lemma][2])

    # Compute average scores
    avg_valence = np.mean(vad_scores['valence']) if vad_scores['valence'] else 0
    avg_arousal = np.mean(vad_scores['arousal']) if vad_scores['arousal'] else 0
    avg_dominance = np.mean(vad_scores['dominance']) if vad_scores['dominance'] else 0

    return [avg_valence, avg_arousal, avg_dominance]

In [44]:
vad_scoring = TextProcessor(input_field='cleaned', output_field='vad_scores', 
                           proc_fn=vad_scoring, verbosity=1000)

#vad_scoring = TextProcessor(proc_fn=vad_scoring, output_field='vad_scores')
no_bots_phab_corpus = vad_scoring.transform(no_bots_phab_corpus)

TypeError: 'TextProcessor' object is not callable

In [43]:
no_bots_phab_corpus.utterances

{'709': Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'PHID-USER-wil4b5lylrvf3krixlkl', 'temp_backend': {}, 'meta': {}}), 'owner': <convokit.model.corpus.Corpus object at 0x146ce5efcb50>, 'id': '709', 'meta': ConvoKitMeta({'affil': True, 'gerrit': False, 'clean_text': "Tested on both the Italian and the English Wikipedia; Reach the bottom of any page: the style drop down menu does not open (it does when you're at the top of the page); the link inspector does not present the list of options.                               Version  : unspecified   Severity  : major   See Also  : https://bugzilla.wikimedia.org/show_bug.cgi?id=55465", 'parsed': [{'rt': 44, 'toks': [{'tok': 'tested', 'tag': 'VBD', 'dep': 'advcl', 'up': 44, 'dn': [1, 9, 10]}, {'tok': 'on', 'tag': 'IN', 'dep': 'prep', 'up': 0, 'dn': [4]}, {'tok': 'both', 'tag': 'CC', 'dep': 'preconj', 'up': 4, 'dn': []}, {'tok': 'the', 'tag': 'DT', 'dep': 'det