{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "18977cd3-de19-43e0-87a7-79ddc0fda973",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML\n",
      " warnings.warn(\"Can't initialize NVML\")\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd \n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "afe53935-8175-4dee-bb27-0bc87fbd5d5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "70a77468-aba1-4042-9900-d3c96d4c9f88",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize\n",
    "#nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8ab815f2-b63b-429b-9036-3870aae54fad",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load(\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fa6a7cea-1375-4153-a388-1847dfa5b257",
   "metadata": {},
   "outputs": [],
   "source": [
    "phab_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv\"\n",
    "phab_df = pd.read_csv(phab_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "812ab4c8-2561-466b-bc57-defc93f5c893",
   "metadata": {},
   "outputs": [],
   "source": [
    "#cleaning df\n",
    "# Keyword patterns are compiled once here and reused for the sentence-level flags,\n",
    "# so both levels share the same case-insensitive matching rules.\n",
    "# The lookbehind keeps contractions such as \"I've\" from being counted as a mention of \"VE\".\n",
    "VE_REF_PATTERN = re.compile(r\"(?<!\\w['\\u2019])\\b(?:visualeditor|ve)\\b\", re.IGNORECASE)\n",
    "BOT_REF_PATTERN = re.compile(r\"\\b(?:bots|scripts|gadgets)\\b\", re.IGNORECASE)\n",
    "\n",
    "# date window in epoch seconds, exclusive at both ends: after 11-1-2012 before 11-1-2013\n",
    "WINDOW_START = 1351728000  # 2012-11-01 00:00:00 UTC\n",
    "WINDOW_END = 1383264000  # 2013-11-01 00:00:00 UTC\n",
    "\n",
    "def mentions(pattern, text):\n",
    "    \"\"\"Return True if `text` is a string that the compiled `pattern` matches anywhere.\n",
    "\n",
    "    Non-string values (e.g. the NaN that pandas uses for a missing cell) give False\n",
    "    instead of raising, so the helper can be applied to a column with gaps.\n",
    "    \"\"\"\n",
    "    return isinstance(text, str) and pattern.search(text) is not None\n",
    "\n",
    "phab_df['has_ref'] = phab_df['comment_text'].apply(lambda x: mentions(VE_REF_PATTERN, x))\n",
    "phab_df['has_bot_ref'] = phab_df['comment_text'].apply(lambda x: mentions(BOT_REF_PATTERN, x))\n",
    "phab_df['timestamp'] = pd.to_datetime(phab_df['date_created'], unit='s', origin='unix', utc=True)\n",
    "# 1-based id taken from the row position in the full (unfiltered) frame\n",
    "phab_df['comment_id'] = phab_df.index + 1\n",
    "filtered_phab_df = phab_df[(phab_df['date_created'] > WINDOW_START) & (phab_df['date_created'] < WINDOW_END)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "60bcef32-67be-44f5-a51a-84e6e63d29ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_urls(text):\n",
    "    \"\"\"Return `text` with every 'http...' token removed.\n",
    "\n",
    "    Each occurrence of 'http' is deleted together with the run of non-whitespace\n",
    "    characters that follows it, which covers http:// and https:// links up to the\n",
    "    next whitespace. `text` must be a string.\n",
    "    \"\"\"\n",
    "    return re.sub(r'http\\S+', '', text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7babf07b-4f91-4e48-88a9-4fe10f8b668d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#turning df into sentences\n",
    "# Start from a fresh, re-indexed copy of the filtered comments. The comment-level\n",
    "# keyword flags are dropped here and recomputed per sentence below.\n",
    "sentence_level_data = filtered_phab_df.reset_index(drop=True).drop(\n",
    "    columns=['has_bot_ref', 'has_ref'], errors='ignore')\n",
    "\n",
    "# list of sentences per comment, then one row per sentence\n",
    "sentence_level_data['sentence'] = sentence_level_data['comment_text'].apply(\n",
    "    lambda x: sent_tokenize(str(x)))\n",
    "\n",
    "sentence_level_data = sentence_level_data.explode('sentence').reset_index(drop=True)\n",
    "\n",
    "# 1-based position of the sentence within its comment\n",
    "sentence_level_data['sentence_id'] = sentence_level_data.groupby('comment_id').cumcount() + 1\n",
    "\n",
    "# Same patterns as the comment-level flags. A comment that tokenizes to no sentences\n",
    "# leaves NaN in 'sentence' after explode; mentions() reports that as False.\n",
    "sentence_level_data['has_ref'] = sentence_level_data['sentence'].apply(\n",
    "    lambda x: mentions(VE_REF_PATTERN, x))\n",
    "\n",
    "sentence_level_data['has_bot_ref'] = sentence_level_data['sentence'].apply(\n",
    "    lambda x: mentions(BOT_REF_PATTERN, x))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "558d1638-abe9-4fc2-896e-6fc1bc396ca3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | task_title | \n", "comment_text | \n", "date_created | \n", "AuthorPHID | \n", "WMFaffil | \n", "TaskPHID | \n", "comment_type | \n", "status | \n", "timestamp | \n", "comment_id | \n", "sentence | \n", "sentence_id | \n", "has_ref | \n", "has_bot_ref | \n", "sentence_no_url | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Tested on both the Italian and the English Wik... | \n", "1380976920 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_description | \n", "resolved | \n", "2013-10-05 12:42:00+00:00 | \n", "709 | \n", "Tested on both the Italian and the English Wik... | \n", "1 | \n", "False | \n", "False | \n", "Tested on both the Italian and the English Wik... | \n", "
1 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Tested on both the Italian and the English Wik... | \n", "1380976920 | \n", "PHID-USER-wil4b5lylrvf3krixlkl | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_description | \n", "resolved | \n", "2013-10-05 12:42:00+00:00 | \n", "709 | \n", "--------------------------\\n**Version**: unspe... | \n", "2 | \n", "False | \n", "False | \n", "--------------------------\\n**Version**: unspe... | \n", "
2 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Note that this is fixed and has been deployed ... | \n", "1381281033 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "2013-10-09 01:10:33+00:00 | \n", "710 | \n", "Note that this is fixed and has been deployed ... | \n", "1 | \n", "False | \n", "False | \n", "Note that this is fixed and has been deployed ... | \n", "
3 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Note that this is fixed and has been deployed ... | \n", "1381281033 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "2013-10-09 01:10:33+00:00 | \n", "710 | \n", "Please re-open if it still occurs for you afte... | \n", "2 | \n", "False | \n", "False | \n", "Please re-open if it still occurs for you afte... | \n", "
4 | \n", "VisualEditor: [Regression] \"More\" menu gets sh... | \n", "Change 88226 merged by Robmoen:\\nThe amazing m... | \n", "1381274919 | \n", "PHID-USER-idceizaw6elwiwm5xshb | \n", "True | \n", "PHID-TASK-64s56xzrc22ustp2z7wx | \n", "task_subcomment | \n", "NaN | \n", "2013-10-08 23:28:39+00:00 | \n", "711 | \n", "Change 88226 merged by Robmoen:\\nThe amazing m... | \n", "1 | \n", "False | \n", "False | \n", "Change 88226 merged by Robmoen:\\nThe amazing m... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
32483 | \n", "Implement minification for SVG files in Resour... | \n", "I think we can apply the same as PNG files.\\n\\... | \n", "1362551377 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "False | \n", "PHID-TASK-s5at4xy2rbu4kz3beaqh | \n", "task_subcomment | \n", "NaN | \n", "2013-03-06 06:29:37+00:00 | \n", "32338 | \n", "I know SVG is text where PNG is not, but unles... | \n", "3 | \n", "False | \n", "False | \n", "I know SVG is text where PNG is not, but unles... | \n", "
32484 | \n", "Implement minification for SVG files in Resour... | \n", "I think we can apply the same as PNG files.\\n\\... | \n", "1362551377 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "False | \n", "PHID-TASK-s5at4xy2rbu4kz3beaqh | \n", "task_subcomment | \n", "NaN | \n", "2013-03-06 06:29:37+00:00 | \n", "32338 | \n", "As for solving this bug, if we want to minify ... | \n", "4 | \n", "False | \n", "False | \n", "As for solving this bug, if we want to minify ... | \n", "
32485 | \n", "Implement minification for SVG files in Resour... | \n", "I think we can apply the same as PNG files.\\n\\... | \n", "1362551377 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "False | \n", "PHID-TASK-s5at4xy2rbu4kz3beaqh | \n", "task_subcomment | \n", "NaN | \n", "2013-03-06 06:29:37+00:00 | \n", "32338 | \n", "fonts), that would get quite tricky. | \n", "5 | \n", "False | \n", "False | \n", "fonts), that would get quite tricky. | \n", "
32486 | \n", "Implement minification for SVG files in Resour... | \n", "I think we can apply the same as PNG files.\\n\\... | \n", "1362551377 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "False | \n", "PHID-TASK-s5at4xy2rbu4kz3beaqh | \n", "task_subcomment | \n", "NaN | \n", "2013-03-06 06:29:37+00:00 | \n", "32338 | \n", "They'd need a url to be served from. | \n", "6 | \n", "False | \n", "False | \n", "They'd need a url to be served from. | \n", "
32487 | \n", "Implement minification for SVG files in Resour... | \n", "I think we can apply the same as PNG files.\\n\\... | \n", "1362551377 | \n", "PHID-USER-sai77mtxmpqnm6pycyvz | \n", "False | \n", "PHID-TASK-s5at4xy2rbu4kz3beaqh | \n", "task_subcomment | \n", "NaN | \n", "2013-03-06 06:29:37+00:00 | \n", "32338 | \n", "As for SVGs we embed in CSS (SVGs for icons li... | \n", "7 | \n", "True | \n", "False | \n", "As for SVGs we embed in CSS (SVGs for icons li... | \n", "
32488 rows × 15 columns
\n", "\n", " | sentence_id | \n", "comment_id | \n", "timestamp | \n", "wmfAffil | \n", "token | \n", "dependency | \n", "head | \n", "depth | \n", "children | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "2 | \n", "723 | \n", "2013-10-06 10:36:55+00:00 | \n", "True | \n", "’ve | \n", "aux | \n", "attached | \n", "1 | \n", "0 | \n", "
1 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "False | \n", "ve.base | \n", "pobj | \n", "for | \n", "3 | \n", "1 | \n", "
2 | \n", "1 | \n", "725 | \n", "2013-10-03 20:15:00+00:00 | \n", "False | \n", "VisualEditor | \n", "nsubjpass | \n", "broken | \n", "1 | \n", "0 | \n", "
3 | \n", "1 | \n", "730 | \n", "2013-10-03 20:32:16+00:00 | \n", "False | \n", "ve.base | \n", "compound | \n", "request | \n", "2 | \n", "0 | \n", "
4 | \n", "2 | \n", "730 | \n", "2013-10-03 20:32:16+00:00 | \n", "False | \n", "ve.core | \n", "compound | \n", "request | \n", "3 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5430 | \n", "2 | \n", "32259 | \n", "2013-10-06 23:34:50+00:00 | \n", "False | \n", "VE | \n", "pobj | \n", "in | \n", "5 | \n", "0 | \n", "
5431 | \n", "1 | \n", "32324 | \n", "2012-12-06 18:16:58+00:00 | \n", "False | \n", "'ve | \n", "aux | \n", "determined | \n", "1 | \n", "0 | \n", "
5432 | \n", "1 | \n", "32324 | \n", "2012-12-06 18:16:58+00:00 | \n", "False | \n", "ve.ce | \n", "pobj | \n", "within | \n", "3 | \n", "0 | \n", "
5433 | \n", "2 | \n", "32330 | \n", "2013-04-28 17:25:19+00:00 | \n", "False | \n", "VE | \n", "dobj | \n", "enabling | \n", "6 | \n", "0 | \n", "
5434 | \n", "7 | \n", "32338 | \n", "2013-03-06 06:29:37+00:00 | \n", "False | \n", "VisualEditor | \n", "pobj | \n", "in | \n", "4 | \n", "0 | \n", "
5435 rows × 9 columns
\n", "\n", " | sentence_id | \n", "comment_id | \n", "timestamp | \n", "token | \n", "dependency | \n", "head | \n", "depth | \n", "children | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "5 | \n", "730 | \n", "2013-10-03 20:32:16+00:00 | \n", "scripts | \n", "pobj | \n", "in | \n", "3 | \n", "2 | \n", "
1 | \n", "1 | \n", "3948 | \n", "2013-10-25 23:44:17+00:00 | \n", "gadgets | \n", "dobj | \n", "load | \n", "3 | \n", "1 | \n", "
2 | \n", "2 | \n", "3948 | \n", "2013-10-25 23:44:17+00:00 | \n", "gadgets | \n", "nsubj | \n", "reach | \n", "1 | \n", "0 | \n", "
3 | \n", "4 | \n", "3963 | \n", "2013-10-04 15:10:00+00:00 | \n", "load.php?debug=false&lang=en&modules=jquery%2C... | \n", "compound | \n", "Exception | \n", "4 | \n", "1 | \n", "
4 | \n", "4 | \n", "3963 | \n", "2013-10-04 15:10:00+00:00 | \n", "load.php?debug=false&lang=en&modules=jquery%2C... | \n", "compound | \n", "Exception | \n", "4 | \n", "2 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
175 | \n", "2 | \n", "29956 | \n", "2013-03-15 11:27:23+00:00 | \n", "bots | \n", "nsubjpass | \n", "voiced | \n", "1 | \n", "2 | \n", "
176 | \n", "6 | \n", "30140 | \n", "2013-06-20 20:09:44+00:00 | \n", "scripts | \n", "pobj | \n", "For | \n", "2 | \n", "2 | \n", "
177 | \n", "3 | \n", "31749 | \n", "2013-08-07 13:35:56+00:00 | \n", "scripts | \n", "attr | \n", "be | \n", "3 | \n", "4 | \n", "
178 | \n", "6 | \n", "31907 | \n", "2013-06-30 19:20:13+00:00 | \n", "gadgets | \n", "dobj | \n", "support | \n", "3 | \n", "0 | \n", "
179 | \n", "8 | \n", "31907 | \n", "2013-06-30 19:20:13+00:00 | \n", "gadgets | \n", "dobj | \n", "support | \n", "3 | \n", "0 | \n", "
180 rows × 8 columns
\n", "\n", " | sentence_id | \n", "comment_id | \n", "timestamp | \n", "token | \n", "dependency | \n", "head | \n", "depth | \n", "children | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "2 | \n", "747 | \n", "2013-09-28 15:44:00+00:00 | \n", "WMF | \n", "nsubj | \n", "comes | \n", "1 | \n", "1 | \n", "
1 | \n", "5 | \n", "750 | \n", "2013-09-30 23:29:57+00:00 | \n", "Wikimedia | \n", "compound | \n", "wiki | \n", "4 | \n", "0 | \n", "
2 | \n", "2 | \n", "752 | \n", "2013-09-30 07:16:14+00:00 | \n", "WMF | \n", "appos | \n", "You | \n", "4 | \n", "0 | \n", "
3 | \n", "2 | \n", "753 | \n", "2013-09-28 19:11:06+00:00 | \n", "Wikimedia | \n", "compound | \n", "wiki | \n", "3 | \n", "0 | \n", "
4 | \n", "2 | \n", "1050 | \n", "2013-06-27 00:46:02+00:00 | \n", "WMF | \n", "pobj | \n", "at | \n", "2 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
431 | \n", "3 | \n", "32254 | \n", "2013-10-07 02:30:30+00:00 | \n", "Wikimedia | \n", "compound | \n", "wikis | \n", "8 | \n", "0 | \n", "
432 | \n", "1 | \n", "32256 | \n", "2013-10-07 00:58:28+00:00 | \n", "wmf | \n", "compound | \n", "wiki | \n", "5 | \n", "0 | \n", "
433 | \n", "1 | \n", "32257 | \n", "2013-10-07 00:44:40+00:00 | \n", "WMF | \n", "compound | \n", "wiki | \n", "3 | \n", "0 | \n", "
434 | \n", "2 | \n", "32258 | \n", "2013-10-07 00:38:54+00:00 | \n", "WMF | \n", "amod | \n", "wikis | \n", "3 | \n", "0 | \n", "
435 | \n", "2 | \n", "32258 | \n", "2013-10-07 00:38:54+00:00 | \n", "WMF | \n", "compound | \n", "wiki | \n", "3 | \n", "0 | \n", "
436 rows × 8 columns
\n", "