1
0
mw-convo-collections/data_cleaning/cleaning_rfcs.ipynb

453 lines
15 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import csv\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def recursive_comment_parsing(dict_array, subcomment):\n",
"    \"\"\"Flatten one comment and all of its nested replies into ``dict_array``.\n",
"\n",
"    A record for ``subcomment`` is appended first, then every entry of its\n",
"    ``'comments'`` value is handled the same way, depth-first, so a parent\n",
"    always precedes its replies in the resulting list.\n",
"\n",
"    Args:\n",
"        dict_array: list that receives the records; it is mutated in place.\n",
"        subcomment: mapping for a single comment. The keys read are\n",
"            ``'text_blocks'``, ``'time_stamp'``, ``'author'`` and ``'comments'``;\n",
"            a missing key is recorded as an empty string.\n",
"\n",
"    Returns:\n",
"        The same ``dict_array`` object that was passed in.\n",
"    \"\"\"\n",
"    # Build the flat record for this comment, one field at a time.\n",
"    row = {}\n",
"    row['comment_text'] = subcomment.get('text_blocks', '')\n",
"    row['date_created'] = subcomment.get('time_stamp', '')\n",
"    row['Author'] = subcomment.get('author', '')\n",
"    dict_array.append(row)\n",
"\n",
"    # A missing, empty or null 'comments' entry means there is nothing to descend into.\n",
"    replies = subcomment.get('comments') or []\n",
"    for reply in replies:\n",
"        recursive_comment_parsing(dict_array, reply)\n",
"\n",
"    return dict_array"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def generate_csv(data):\n",
"    \"\"\"Build a DataFrame with one row per comment found in ``data``.\n",
"\n",
"    Despite the name, nothing is written to disk here: the function only\n",
"    returns the DataFrame, and exporting it is left to the caller.\n",
"\n",
"    Args:\n",
"        data: iterable of section mappings. Each section's ``'comments'`` value\n",
"            is expected to be a list of comment mappings; every comment and its\n",
"            nested replies are flattened by ``recursive_comment_parsing``.\n",
"            Sections whose ``'comments'`` is missing, null or empty add no rows.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with the columns ``comment_text``, ``date_created``\n",
"        and ``Author``. The columns are present even when no comment was\n",
"        found, so later column lookups such as ``df['Author']`` keep working.\n",
"    \"\"\"\n",
"    columns = ['comment_text', 'date_created', 'Author']\n",
"    comments = []\n",
"    for subsection in data:\n",
"        # .get(...) or [] tolerates sections that lack the key or carry a null value.\n",
"        for subcomment in subsection.get('comments') or []:\n",
"            recursive_comment_parsing(comments, subcomment)\n",
"    # Pin the schema: without explicit columns an empty input yields a frame with no columns.\n",
"    return pd.DataFrame(comments, columns=columns)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Load the VisualEditor feedback dump and flatten its comment threads.\n",
"# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.\n",
"fp = \"/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/ve-feedback-2011-12.json\"\n",
"# JSON text is UTF-8; decode it explicitly instead of relying on the locale default.\n",
"with open(fp, 'r', encoding='utf-8') as file:\n",
"    data = json.load(file)['sections']\n",
"test_df = generate_csv(data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>Author</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[{{talkarchive}}\\n, \\n]</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[== Link editing should work when cursor is on...</td>\n",
" <td>20:38, 13 December 2011 (UTC)</td>\n",
" <td>Dantman</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[\\n, :Yes, the little down-arrow with B/F/Link...</td>\n",
" <td>22:16, 13 December 2011 (UTC)</td>\n",
" <td>JakobVoss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[\\n]</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[== Collapse same text links ==\\n, \\n, &lt;small&gt;...</td>\n",
" <td>20:43, 13 December 2011 (UTC)</td>\n",
" <td>Dantman</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>429</th>\n",
" <td>[\\n]</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>430</th>\n",
" <td>[== User Interface ==\\n, \\n, &lt;small&gt;User agent...</td>\n",
" <td>12:02, 30 December 2011 (UTC)</td>\n",
" <td>TJRana</td>\n",
" </tr>\n",
" <tr>\n",
" <th>431</th>\n",
" <td>[\\n]</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>432</th>\n",
" <td>[== Feedback by users that see no feedback lin...</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>433</th>\n",
" <td>[== Usability ==\\n, \\n, It's great that you're...</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>434 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" comment_text \\\n",
"0 [{{talkarchive}}\\n, \\n] \n",
"1 [== Link editing should work when cursor is on... \n",
"2 [\\n, :Yes, the little down-arrow with B/F/Link... \n",
"3 [\\n] \n",
"4 [== Collapse same text links ==\\n, \\n, <small>... \n",
".. ... \n",
"429 [\\n] \n",
"430 [== User Interface ==\\n, \\n, <small>User agent... \n",
"431 [\\n] \n",
"432 [== Feedback by users that see no feedback lin... \n",
"433 [== Usability ==\\n, \\n, It's great that you're... \n",
"\n",
" date_created Author \n",
"0 \n",
"1 20:38, 13 December 2011 (UTC) Dantman \n",
"2 22:16, 13 December 2011 (UTC) JakobVoss \n",
"3 \n",
"4 20:43, 13 December 2011 (UTC) Dantman \n",
".. ... ... \n",
"429 \n",
"430 12:02, 30 December 2011 (UTC) TJRana \n",
"431 \n",
"432 \n",
"433 \n",
"\n",
"[434 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the DataFrame that generate_csv built from the file at fp.\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Load the Parsoid talk-archive dump and flatten its comment threads.\n",
"# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.\n",
"fp2 = \"/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json\"\n",
"# JSON text is UTF-8; decode it explicitly instead of relying on the locale default.\n",
"with open(fp2, 'r', encoding='utf-8') as file2:\n",
"    data2 = json.load(file2)['sections']\n",
"test_df2 = generate_csv(data2)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: []\n",
"Index: []"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the DataFrame that generate_csv built from the file at fp2.\n",
"test_df2"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# Stack the two per-file frames and renumber the rows from zero.\n",
"df_combined = pd.concat(objs=[test_df, test_df2], ignore_index=True)\n",
"# Keep only the rows whose Author is not the empty string.\n",
"df_cleaned = df_combined.loc[df_combined['Author'].ne('')]"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_text</th>\n",
" <th>date_created</th>\n",
" <th>Author</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[==Parsoid working but getting JSON error==\\n,...</td>\n",
" <td>03:07, 31 May 2015 (UTC)</td>\n",
" <td>Alex Mashin</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[\\n, ----\\n, \\n, I find out that when you use ...</td>\n",
" <td>09:30, 20 Jun 2015 (UTC)</td>\n",
" <td>mnrahimi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[==Set up in web server?==\\n, My wiki is on th...</td>\n",
" <td>15:37, 24 June 2015 (UTC)</td>\n",
" <td>Arlolra</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[== Multiple wikis ==\\n, Can several Mediawiki...</td>\n",
" <td>18:35, 13 October 2012 (UTC)</td>\n",
" <td>Amire80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[\\n, : Yes. The Parsoid takes a prefix when th...</td>\n",
" <td>22:34, 15 October 2012 (UTC)</td>\n",
" <td>Jdforrester (WMF)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>[== Installation on OS X {{Resolved}} ==\\n, \\n...</td>\n",
" <td>13:53, 31 May 2015 (UTC)</td>\n",
" <td>Dieudo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>161</th>\n",
" <td>[\\n, : Resolved by [https://www.mediawiki.org/...</td>\n",
" <td>15:09, 24 June 2015 (UTC)</td>\n",
" <td>Arlolra</td>\n",
" </tr>\n",
" <tr>\n",
" <th>163</th>\n",
" <td>[== Error: Cannot find module '/etc/mediawiki/...</td>\n",
" <td>16:35, 29 October 2015 (UTC)</td>\n",
" <td>Krauss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>164</th>\n",
" <td>[\\n, : bin/server.js is in parsoid master and ...</td>\n",
" <td>16:37, 29 October 2015 (UTC)</td>\n",
" <td>SSastry (WMF)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>[==Can see VisualEditor interface but can't us...</td>\n",
" <td>00:44, 19 March 2016 (UTC)</td>\n",
" <td>2A01:E35:2F5E:A0E0:948F:579B:2435:72DB</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>106 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" comment_text \\\n",
"1 [==Parsoid working but getting JSON error==\\n,... \n",
"2 [\\n, ----\\n, \\n, I find out that when you use ... \n",
"4 [==Set up in web server?==\\n, My wiki is on th... \n",
"6 [== Multiple wikis ==\\n, Can several Mediawiki... \n",
"7 [\\n, : Yes. The Parsoid takes a prefix when th... \n",
".. ... \n",
"160 [== Installation on OS X {{Resolved}} ==\\n, \\n... \n",
"161 [\\n, : Resolved by [https://www.mediawiki.org/... \n",
"163 [== Error: Cannot find module '/etc/mediawiki/... \n",
"164 [\\n, : bin/server.js is in parsoid master and ... \n",
"166 [==Can see VisualEditor interface but can't us... \n",
"\n",
" date_created Author \n",
"1 03:07, 31 May 2015 (UTC) Alex Mashin \n",
"2 09:30, 20 Jun 2015 (UTC) mnrahimi \n",
"4 15:37, 24 June 2015 (UTC) Arlolra \n",
"6 18:35, 13 October 2012 (UTC) Amire80 \n",
"7 22:34, 15 October 2012 (UTC) Jdforrester (WMF) \n",
".. ... ... \n",
"160 13:53, 31 May 2015 (UTC) Dieudo \n",
"161 15:09, 24 June 2015 (UTC) Arlolra \n",
"163 16:35, 29 October 2015 (UTC) Krauss \n",
"164 16:37, 29 October 2015 (UTC) SSastry (WMF) \n",
"166 00:44, 19 March 2016 (UTC) 2A01:E35:2F5E:A0E0:948F:579B:2435:72DB \n",
"\n",
"[106 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the combined frame after rows with an empty Author were dropped.\n",
"df_cleaned"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Write the filtered comments to a CSV file, leaving the DataFrame index out.\n",
"output_file = '0220_ve_rfcs_text.csv'\n",
"df_cleaned.to_csv(path_or_buf=output_file, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}