{ "cells": [ { "cell_type": "code", "execution_count": 35, "id": "18977cd3-de19-43e0-87a7-79ddc0fda973", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import spacy" ] }, { "cell_type": "code", "execution_count": 36, "id": "afe53935-8175-4dee-bb27-0bc87fbd5d5f", "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 3, "id": "70a77468-aba1-4042-9900-d3c96d4c9f88", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] /mmfs1/home/mjilg/nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "from nltk.tokenize import sent_tokenize\n", "nltk.download('punkt_tab')" ] }, { "cell_type": "code", "execution_count": 4, "id": "8ab815f2-b63b-429b-9036-3870aae54fad", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "fa6a7cea-1375-4153-a388-1847dfa5b257", "metadata": {}, "outputs": [], "source": [ "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv\"\n", "phab_df = pd.read_csv(phab_path)" ] }, { "cell_type": "code", "execution_count": 7, "id": "812ab4c8-2561-466b-bc57-defc93f5c893", "metadata": {}, "outputs": [], "source": [ "#cleaning df \n", "phab_df['has_ref'] = phab_df['comment_text'].apply(lambda x: bool(re.search(r\" visualeditor| VE | ve |VisualEditor\", str(x), re.IGNORECASE)))\n", "phab_df['has_bot_ref'] = phab_df['comment_text'].apply(lambda x: bool(re.search(r\" bots | scripts | gadgets \", str(x), re.IGNORECASE)))\n", "phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n", "phab_df['comment_id'] = phab_df.index + 1\n", "filtered_phab_df = phab_df[(phab_df['date_created'] < 1383264000) & (phab_df['date_created'] > 1351728000)]" ] }, { "cell_type": "code", "execution_count": 8, "id": "7babf07b-4f91-4e48-88a9-4fe10f8b668d", "metadata": {}, "outputs": [], "source": [ "#turning df into sentences \n", "sentence_level_data = filtered_phab_df.explode(\n", " 'comment_text').reset_index(drop=True)\n", "\n", "sentence_level_data['sentence'] = sentence_level_data['comment_text'].apply(\n", " lambda x: sent_tokenize(str(x)))\n", "\n", "sentence_level_data = sentence_level_data.explode('sentence').reset_index(drop=True)\n", "\n", "sentence_level_data['sentence_id'] = sentence_level_data.groupby('comment_id').cumcount() + 1\n", "\n", "sentence_level_data = sentence_level_data.drop(columns=['has_bot_ref', 'has_ref'], errors='ignore')\n", "\n", "sentence_level_data['has_ref'] = sentence_level_data['sentence'].apply(\n", " lambda x: bool(re.search(r'visualeditor| VE | ve |VisualEditor', x)))\n", "\n", "sentence_level_data['has_bot_ref'] = sentence_level_data['sentence'].apply(\n", " lambda x: bool(re.search(r'bots|scripts|gadgets', x)))\n", "\n", "sentence_level_data = sentence_level_data.reset_index(drop=True)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "acb87a1a-c3e0-4d3f-8450-e2af96150e94", "metadata": {}, "outputs": [], "source": [ "def extract_dependency_tree(sentence):\n", " doc = nlp(sentence)\n", " return [(token.text, token.dep_, token.head.text) for token in doc]" ] }, { "cell_type": "code", "execution_count": 10, "id": "b67c136e-16c4-4002-a2d6-f92c88252baf", "metadata": {}, 
"outputs": [], "source": [ "filtered_sentence_level_data = sentence_level_data[sentence_level_data['has_ref'] == True]" ] }, { "cell_type": "code", "execution_count": 12, "id": "f749706a-f2bb-42e3-aae5-3876b00c48ad", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_88113/2706376531.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " filtered_sentence_level_data['dependency_tree'] = filtered_sentence_level_data['sentence'].apply(extract_dependency_tree)\n" ] } ], "source": [ "filtered_sentence_level_data['dependency_tree'] = filtered_sentence_level_data['sentence'].apply(extract_dependency_tree)\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "82c48463-5a90-4105-9ee9-5763d0b1e35b", "metadata": {}, "outputs": [], "source": [ "dependency_relations = []\n", "\n", "for index, row in filtered_sentence_level_data.iterrows():\n", " sentence = row['sentence']\n", " timestamp = row['timestamp']\n", " comment_id = row['comment_id']\n", " sentence_id = row['sentence_id']\n", " \n", " for token, dep, head in row['dependency_tree']:\n", " if re.search(r'visualeditor|VE|ve|VisualEditor', token, re.IGNORECASE):\n", " dependency_relations.append({\n", " 'sentence_id': sentence_id,\n", " 'comment_id': comment_id,\n", " 'timestamp': timestamp,\n", " 'token': token,\n", " 'dependency': dep,\n", " 'head': head\n", " })\n", " \n", "dependency_relations_df = pd.DataFrame(dependency_relations)" ] }, { "cell_type": "code", "execution_count": 15, "id": "60d7d808-0211-41b3-8dfa-0143dbba94bd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | sentence_id | \n", "comment_id | \n", "timestamp | \n", "token | \n", "dependency | \n", "head | \n", "
---|---|---|---|---|---|---|
0 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "ve.base | \n", "pobj | \n", "for | \n", "
1 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "have | \n", "aux | \n", "switched | \n", "
2 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "VisualEditor | \n", "nsubjpass | \n", "broken | \n", "
3 | \n", "1 | \n", "747 | \n", "2013-09-28 15:44:00+00:00 | \n", "VE | \n", "pobj | \n", "of | \n", "
4 | \n", "5 | \n", "750 | \n", "2013-09-30 23:29:57+00:00 | \n", "VisualEditor | \n", "nsubj | \n", "switched | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5744 | \n", "2 | \n", "32259 | \n", "2013-10-06 23:34:50+00:00 | \n", "autosave | \n", "advcl | \n", "allow | \n", "
5745 | \n", "2 | \n", "32259 | \n", "2013-10-06 23:34:50+00:00 | \n", "VE | \n", "pobj | \n", "in | \n", "
5746 | \n", "2 | \n", "32330 | \n", "2013-04-28 17:25:19+00:00 | \n", "VE | \n", "dobj | \n", "enabling | \n", "
5747 | \n", "7 | \n", "32338 | \n", "2013-03-06 06:29:37+00:00 | \n", "VisualEditor | \n", "pobj | \n", "in | \n", "
5748 | \n", "7 | \n", "32338 | \n", "2013-03-06 06:29:37+00:00 | \n", "have | \n", "advcl | \n", "minified | \n", "
5749 rows × 6 columns
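{ "cell_type": "markdown", "metadata": {}, "source": [ "The `extract_dependency_tree` helper defined above returns one `(token, dependency label, head token)` triple per token in a sentence. A quick illustrative call (the example sentence is invented, not drawn from the Phabricator comments):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative only: inspect the triples spaCy produces for a hand-written sentence\n", "extract_dependency_tree('VisualEditor was broken after the switch.')" ] },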
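{ "cell_type": "markdown", "metadata": {}, "source": [ "The SettingWithCopyWarning above is raised because `filtered_sentence_level_data` is a slice of `sentence_level_data`. A minimal sketch of the usual remedy, taking an explicit copy before adding the `dependency_tree` column:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: copying the filtered slice avoids assigning into a view of sentence_level_data\n", "filtered_sentence_level_data = sentence_level_data[sentence_level_data['has_ref']].copy()\n", "filtered_sentence_level_data['dependency_tree'] = filtered_sentence_level_data['sentence'].apply(extract_dependency_tree)" ] },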
\n", "\n", " | sentence_id | \n", "comment_id | \n", "timestamp | \n", "token | \n", "dependency | \n", "head | \n", "
---|---|---|---|---|---|---|
0 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "ve.base | \n", "pobj | \n", "for | \n", "
2 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "VisualEditor | \n", "nsubjpass | \n", "broken | \n", "
3 | \n", "1 | \n", "747 | \n", "2013-09-28 15:44:00+00:00 | \n", "VE | \n", "pobj | \n", "of | \n", "
4 | \n", "5 | \n", "750 | \n", "2013-09-30 23:29:57+00:00 | \n", "VisualEditor | \n", "nsubj | \n", "switched | \n", "
5 | \n", "3 | \n", "752 | \n", "2013-09-30 07:16:14+00:00 | \n", "VE | \n", "nsubj | \n", "being | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5742 | \n", "5 | \n", "32251 | \n", "2013-10-08 21:22:02+00:00 | \n", "VisualEditor | \n", "poss | \n", "take | \n", "
5743 | \n", "2 | \n", "32259 | \n", "2013-10-06 23:34:50+00:00 | \n", "VE | \n", "nsubj | \n", "autosave | \n", "
5745 | \n", "2 | \n", "32259 | \n", "2013-10-06 23:34:50+00:00 | \n", "VE | \n", "pobj | \n", "in | \n", "
5746 | \n", "2 | \n", "32330 | \n", "2013-04-28 17:25:19+00:00 | \n", "VE | \n", "dobj | \n", "enabling | \n", "
5747 | \n", "7 | \n", "32338 | \n", "2013-03-06 06:29:37+00:00 | \n", "VisualEditor | \n", "pobj | \n", "in | \n", "
3579 rows × 6 columns
\n", "