def http_relevant(text):
    """Report whether a comment uses HTTP/login/SSL/TLS/certificate vocabulary.

    The text is split on whitespace and every token is lower-cased before
    matching. Tokens containing "://" are skipped, so a pasted URL on its own
    does not count as a mention. A remaining token is a hit when it contains
    "http", "login", "ssl" or "tls" as a substring, or when it starts with
    "cert" without starting with "certain".

    Returns False when `text` is null (per `pd.isnull`) or when no token
    matches; True on the first matching token.
    """
    if pd.isnull(text):
        return False

    substring_keys = ("http", "login", "ssl", "tls")
    for raw_token in text.split():
        token = raw_token.lower()
        # URL-like tokens are ignored entirely.
        if "://" in token:
            continue
        if any(key in token for key in substring_keys):
            return True
        # "cert..." counts, except for the "certain"/"certainly" family.
        if token.startswith("cert") and not token.startswith("certain"):
            return True
    return False


def is_migrated(comment_text):
    """Report whether a comment begins with the SourceForge origin marker.

    Surrounding whitespace is stripped before the prefix comparison.
    Returns False when `comment_text` is null (per `pd.isnull`).
    """
    if pd.isnull(comment_text):
        return False
    return comment_text.strip().startswith("Originally from: http://sourceforge.net")
\n", " | task_title | \n", "comment_text | \n", "date_created | \n", "speaker | \n", "meta.affil | \n", "conversation_id | \n", "comment_type | \n", "status | \n", "meta.gerrit | \n", "id | \n", "reply_to | \n", "timestamp | \n", "is_relevant | \n", "is_migrated | \n", "text | \n", "resolved_text | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "time data error | \n", "After last update via SVN bot does not work, s... | \n", "1381482240 | \n", "PHID-USER-wwnv7nzuscfuc2xfjwbq | \n", "False | \n", "PHID-TASK-qjt5coghg7n62wamkubq | \n", "task_description | \n", "resolved | \n", "False | \n", "115 | \n", "NaN | \n", "2013-10-11 09:04:00+00:00 | \n", "True | \n", "False | \n", "After last update via SVN bot does not work, s... | \n", "After last update via SVN bot does not work, s... | \n", "
1 | \n", "time data error | \n", "SVN r10320 is https://gerrit.wikimedia.org/r/8... | \n", "1381484030 | \n", "PHID-USER-xezsyhikbr7hjrig2ofp | \n", "False | \n", "PHID-TASK-qjt5coghg7n62wamkubq | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "118 | \n", "117.0 | \n", "2013-10-11 09:33:50+00:00 | \n", "True | \n", "False | \n", "SVN r10320 is https://gerrit.wikimedia.org/r/8... | \n", "SVN r10320 is https://gerrit.wikimedia.org/r/8... | \n", "
2 | \n", "time data error | \n", "see also bug 55399 | \n", "1381483747 | \n", "PHID-USER-xezsyhikbr7hjrig2ofp | \n", "False | \n", "PHID-TASK-qjt5coghg7n62wamkubq | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "119 | \n", "118.0 | \n", "2013-10-11 09:29:07+00:00 | \n", "True | \n", "False | \n", "see also bug 55399 | \n", "see also bug 55399 | \n", "
3 | \n", "time data error | \n", "It's a mess with these timestamps. Without tha... | \n", "1381483651 | \n", "PHID-USER-xezsyhikbr7hjrig2ofp | \n", "False | \n", "PHID-TASK-qjt5coghg7n62wamkubq | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "120 | \n", "119.0 | \n", "2013-10-11 09:27:31+00:00 | \n", "True | \n", "False | \n", "It's a mess with these timestamps. Without tha... | \n", "It's a mess with these timestamps. Without tha... | \n", "
4 | \n", "time data error | \n", "When I go back from SVN revision 10320 to 1031... | \n", "1381482504 | \n", "PHID-USER-wwnv7nzuscfuc2xfjwbq | \n", "False | \n", "PHID-TASK-qjt5coghg7n62wamkubq | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "121 | \n", "120.0 | \n", "2013-10-11 09:08:24+00:00 | \n", "True | \n", "False | \n", "When I go back from SVN revision 10320 to 1031... | \n", "When I go back from SVN revision 10320 to 1031... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
6510 | \n", "VisualEditor: Automatic naming scheme for ref... | \n", "Intention:\\nRe-use a reference.\\n\\n\\nActual Re... | \n", "1385163660 | \n", "PHID-USER-uf3buojo4ceizjywvyn5 | \n", "True | \n", "PHID-TASK-j3rfh4pmjx4pel7dk2tn | \n", "task_description | \n", "duplicate | \n", "False | \n", "155659 | \n", "NaN | \n", "2013-11-22 23:41:00+00:00 | \n", "True | \n", "False | \n", "Intention:\\nRe-use a reference.\\n\\n\\nActual Re... | \n", "Intention:\\nRe-use a reference.\\n\\n\\nActual Re... | \n", "
6511 | \n", "VisualEditor: Automatic naming scheme for ref... | \n", "Speaking as an extensive editor, I just find t... | \n", "1385399054 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-j3rfh4pmjx4pel7dk2tn | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "155661 | \n", "155660.0 | \n", "2013-11-25 17:04:14+00:00 | \n", "True | \n", "False | \n", "Speaking as an extensive editor, I just find t... | \n", "Speaking as an extensive editor, I just find t... | \n", "
6512 | \n", "VisualEditor: Automatic naming scheme for ref... | \n", "I realize that any automagic system will have ... | \n", "1385397795 | \n", "PHID-USER-uf3buojo4ceizjywvyn5 | \n", "True | \n", "PHID-TASK-j3rfh4pmjx4pel7dk2tn | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "155662 | \n", "155661.0 | \n", "2013-11-25 16:43:15+00:00 | \n", "True | \n", "False | \n", "I realize that any automagic system will have ... | \n", "I realize that any automagic system will have ... | \n", "
6513 | \n", "VisualEditor: Automatic naming scheme for ref... | \n", "Why humans need to be able to remember the ref... | \n", "1385397298 | \n", "PHID-USER-uf3buojo4ceizjywvyn5 | \n", "True | \n", "PHID-TASK-j3rfh4pmjx4pel7dk2tn | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "155663 | \n", "155662.0 | \n", "2013-11-25 16:34:58+00:00 | \n", "True | \n", "False | \n", "Why humans need to be able to remember the ref... | \n", "Why humans need to be able to remember the ref... | \n", "
6514 | \n", "VisualEditor: Automatic naming scheme for ref... | \n", "(In reply to comment #0)\\n> The ref naming sch... | \n", "1385394470 | \n", "PHID-USER-ydswvwhh5pm4lshahjje | \n", "True | \n", "PHID-TASK-j3rfh4pmjx4pel7dk2tn | \n", "task_subcomment | \n", "NaN | \n", "False | \n", "155664 | \n", "155663.0 | \n", "2013-11-25 15:47:50+00:00 | \n", "True | \n", "False | \n", "(In reply to comment #0)\\n> The ref naming sch... | \n", "(In reply to comment #0)\\n> The ref naming sch... | \n", "
6515 rows × 16 columns
\n", "\n", " | comment_id | \n", "timestamp | \n", "wmfAffil | \n", "token | \n", "dependency | \n", "head | \n", "depth | \n", "children | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "115 | \n", "2013-10-11 09:04:00+00:00 | \n", "False | \n", "use_api_login | \n", "dobj | \n", "use_api | \n", "1 | \n", "6 | \n", "
1 | \n", "157 | \n", "2013-10-07 08:09:00+00:00 | \n", "False | \n", "use_api_login | \n", "dobj | \n", "use_api | \n", "1 | \n", "4 | \n", "
2 | \n", "177 | \n", "2013-10-04 17:56:00+00:00 | \n", "False | \n", "certainly | \n", "advmod | \n", "require | \n", "2 | \n", "1 | \n", "
3 | \n", "247 | \n", "2013-09-27 22:15:00+00:00 | \n", "False | \n", "Login | \n", "ROOT | \n", "Login | \n", "0 | \n", "4 | \n", "
4 | \n", "426 | \n", "2013-09-01 11:26:00+00:00 | \n", "False | \n", "HTTP | \n", "compound | \n", "login | \n", "4 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1463 | \n", "45300 | \n", "2013-08-01 17:35:00+00:00 | \n", "False | \n", "certain | \n", "amod | \n", "commands | \n", "5 | \n", "0 | \n", "
1464 | \n", "45300 | \n", "2013-08-01 17:35:00+00:00 | \n", "False | \n", "certain | \n", "amod | \n", "commands | \n", "5 | \n", "0 | \n", "
1465 | \n", "45373 | \n", "2013-07-27 13:30:00+00:00 | \n", "False | \n", "certain | \n", "amod | \n", "element | \n", "8 | \n", "0 | \n", "
1466 | \n", "46078 | \n", "2013-06-18 21:17:00+00:00 | \n", "False | \n", "HTTP | \n", "compound | \n", "Error | \n", "2 | \n", "0 | \n", "
1467 | \n", "46086 | \n", "2013-06-19 23:31:02+00:00 | \n", "False | \n", "HTTP | \n", "compound | \n", "Error | \n", "3 | \n", "0 | \n", "
1468 rows × 8 columns
\n", "