1
0
mw-lifecycle-analysis/text_analysis/case3/.ipynb_checkpoints/050825_join_resolved_files-checkpoint.ipynb

488 lines
20 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fcc726a8-44a4-48cf-a1cd-937b05bd4d08",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1fceca29-48c1-4ba3-93ba-88724dea22a7",
"metadata": {},
"outputs": [],
"source": [
"first_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_to_2014.csv\"\n",
"first_resolved_df = pd.read_csv(first_resolved_path)\n",
"second_resolved_path = \"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/051725_coref_rel_phab_comments_2014_to_2015.csv\"\n",
"second_resolved_df = pd.read_csv(second_resolved_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f26c31e7-bee1-4100-821f-769e5b1791bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8621"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(second_resolved_df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dfa81ca2-4d66-4679-bc3e-192d0cac67fa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5007"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(first_resolved_df)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6dc11bda-f0f6-4eb6-96f5-02ed9a3492ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13628"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_df = pd.concat([first_resolved_df, second_resolved_df])\n",
"unique_df = combined_df.drop_duplicates()\n",
"len(unique_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0c903199-8159-455c-aa7f-e57ef07ce03e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>task_title</th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>speaker</th>\n",
" <th>meta.affil</th>\n",
" <th>conversation_id</th>\n",
" <th>comment_type</th>\n",
" <th>status</th>\n",
" <th>meta.gerrit</th>\n",
" <th>id</th>\n",
" <th>reply_to</th>\n",
" <th>timestamp</th>\n",
" <th>is_relevant</th>\n",
" <th>is_migrated</th>\n",
" <th>text</th>\n",
" <th>resolved_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" <td>1411541280</td>\n",
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_description</td>\n",
" <td>resolved</td>\n",
" <td>False</td>\n",
" <td>243215</td>\n",
" <td>NaN</td>\n",
" <td>2014-09-24 06:48:00+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" <td>User:NickK reported in IRC that they're gettin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>Revert has been deployed.</td>\n",
" <td>1411573104</td>\n",
" <td>PHID-USER-v7vgzvvcw7v2umf737ri</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243216</td>\n",
" <td>243215.0</td>\n",
" <td>2014-09-24 15:38:24+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Revert has been deployed.</td>\n",
" <td>Revert has been deployed.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" <td>1411572378</td>\n",
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243217</td>\n",
" <td>243216.0</td>\n",
" <td>2014-09-24 15:26:18+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162550 merged...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" <td>1411545535</td>\n",
" <td>PHID-USER-v7bwpq3rs3zdxegibdbh</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243218</td>\n",
" <td>243217.0</td>\n",
" <td>2014-09-24 07:58:55+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" <td>(In reply to Kunal Mehta (Legoktm) from commen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>User with unattached accounts unable to login ...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" <td>1411542640</td>\n",
" <td>PHID-USER-ynivjflmc2dcl6w5ut5v</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-mio2uq45ny7mms72syut</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>243219</td>\n",
" <td>243218.0</td>\n",
" <td>2014-09-24 07:10:40+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" <td>**gerritadmin** wrote:\\n\\nChange 162549 merged...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8616</th>\n",
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" <td>1413205650</td>\n",
" <td>PHID-USER-hgn5uw2jafgjgfvxibhh</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378799</td>\n",
" <td>378798.0</td>\n",
" <td>2014-10-13 13:07:30+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" <td>&gt; When I registered, phabricator linked mediaw...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8617</th>\n",
" <td>OAuth login refers to mediawiki.org:/ instead ...</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" <td>1412958953</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-yeaxsfxhhtbn26koo5fi</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378800</td>\n",
" <td>378799.0</td>\n",
" <td>2014-10-10 16:35:53+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" <td>See {T574} for a related discussion.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8618</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" <td>1412362816</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_description</td>\n",
" <td>declined</td>\n",
" <td>False</td>\n",
" <td>378858</td>\n",
" <td>NaN</td>\n",
" <td>2014-10-03 19:00:16+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" <td>Some improvements to the Wikimedia SUL dialog:...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8619</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" <td>1412415111</td>\n",
" <td>PHID-USER-lluzkul4z7us4sxkayss</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378860</td>\n",
" <td>378859.0</td>\n",
" <td>2014-10-04 09:31:51+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" <td>I guess the same restrictions as in T543 apply...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8620</th>\n",
" <td>Improvements to Wikimedia SUL login dialog UI:...</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" <td>1412366627</td>\n",
" <td>PHID-USER-fn7qnpccfbitivgtw2rt</td>\n",
" <td>False</td>\n",
" <td>PHID-TASK-j6czqxlv5fzcx3tmq23n</td>\n",
" <td>task_subcomment</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>378861</td>\n",
" <td>378860.0</td>\n",
" <td>2014-10-03 20:03:47+00:00</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" <td>It's not entirely trivial to change</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13628 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" task_title \\\n",
"0 User with unattached accounts unable to login ... \n",
"1 User with unattached accounts unable to login ... \n",
"2 User with unattached accounts unable to login ... \n",
"3 User with unattached accounts unable to login ... \n",
"4 User with unattached accounts unable to login ... \n",
"... ... \n",
"8616 OAuth login refers to mediawiki.org:/ instead ... \n",
"8617 OAuth login refers to mediawiki.org:/ instead ... \n",
"8618 Improvements to Wikimedia SUL login dialog UI:... \n",
"8619 Improvements to Wikimedia SUL login dialog UI:... \n",
"8620 Improvements to Wikimedia SUL login dialog UI:... \n",
"\n",
" comment_text date_created \\\n",
"0 User:NickK reported in IRC that they're gettin... 1411541280 \n",
"1 Revert has been deployed. 1411573104 \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... 1411572378 \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... 1411545535 \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... 1411542640 \n",
"... ... ... \n",
"8616 > When I registered, phabricator linked mediaw... 1413205650 \n",
"8617 See {T574} for a related discussion. 1412958953 \n",
"8618 Some improvements to the Wikimedia SUL dialog:... 1412362816 \n",
"8619 I guess the same restrictions as in T543 apply... 1412415111 \n",
"8620 It's not entirely trivial to change 1412366627 \n",
"\n",
" speaker meta.affil \\\n",
"0 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
"1 PHID-USER-v7vgzvvcw7v2umf737ri False \n",
"2 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
"3 PHID-USER-v7bwpq3rs3zdxegibdbh False \n",
"4 PHID-USER-ynivjflmc2dcl6w5ut5v False \n",
"... ... ... \n",
"8616 PHID-USER-hgn5uw2jafgjgfvxibhh False \n",
"8617 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8618 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8619 PHID-USER-lluzkul4z7us4sxkayss False \n",
"8620 PHID-USER-fn7qnpccfbitivgtw2rt False \n",
"\n",
" conversation_id comment_type status meta.gerrit \\\n",
"0 PHID-TASK-mio2uq45ny7mms72syut task_description resolved False \n",
"1 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"2 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"3 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"4 PHID-TASK-mio2uq45ny7mms72syut task_subcomment NaN False \n",
"... ... ... ... ... \n",
"8616 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
"8617 PHID-TASK-yeaxsfxhhtbn26koo5fi task_subcomment NaN False \n",
"8618 PHID-TASK-j6czqxlv5fzcx3tmq23n task_description declined False \n",
"8619 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
"8620 PHID-TASK-j6czqxlv5fzcx3tmq23n task_subcomment NaN False \n",
"\n",
" id reply_to timestamp is_relevant is_migrated \\\n",
"0 243215 NaN 2014-09-24 06:48:00+00:00 True False \n",
"1 243216 243215.0 2014-09-24 15:38:24+00:00 True False \n",
"2 243217 243216.0 2014-09-24 15:26:18+00:00 True False \n",
"3 243218 243217.0 2014-09-24 07:58:55+00:00 True False \n",
"4 243219 243218.0 2014-09-24 07:10:40+00:00 True False \n",
"... ... ... ... ... ... \n",
"8616 378799 378798.0 2014-10-13 13:07:30+00:00 True False \n",
"8617 378800 378799.0 2014-10-10 16:35:53+00:00 True False \n",
"8618 378858 NaN 2014-10-03 19:00:16+00:00 True False \n",
"8619 378860 378859.0 2014-10-04 09:31:51+00:00 True False \n",
"8620 378861 378860.0 2014-10-03 20:03:47+00:00 True False \n",
"\n",
" text \\\n",
"0 User:NickK reported in IRC that they're gettin... \n",
"1 Revert has been deployed. \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
"... ... \n",
"8616 > When I registered, phabricator linked mediaw... \n",
"8617 See {T574} for a related discussion. \n",
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
"8619 I guess the same restrictions as in T543 apply... \n",
"8620 It's not entirely trivial to change \n",
"\n",
" resolved_text \n",
"0 User:NickK reported in IRC that they're gettin... \n",
"1 Revert has been deployed. \n",
"2 **gerritadmin** wrote:\\n\\nChange 162550 merged... \n",
"3 (In reply to Kunal Mehta (Legoktm) from commen... \n",
"4 **gerritadmin** wrote:\\n\\nChange 162549 merged... \n",
"... ... \n",
"8616 > When I registered, phabricator linked mediaw... \n",
"8617 See {T574} for a related discussion. \n",
"8618 Some improvements to the Wikimedia SUL dialog:... \n",
"8619 I guess the same restrictions as in T543 apply... \n",
"8620 It's not entirely trivial to change \n",
"\n",
"[13628 rows x 16 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unique_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0c392d70-6236-4dfe-b6d4-bbe3f422b09e",
"metadata": {},
"outputs": [],
"source": [
"unique_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/0050825_coref-rel-first.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}