{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import json\n", "import csv\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def recursive_comment_parsing(dict_array, subcomment):\n", " # Append the current comment details to the dict_array\n", " dict_array.append({\n", " 'comment_text': subcomment.get('text_blocks', ''),\n", " 'date_created': subcomment.get('time_stamp', ''),\n", " 'Author': subcomment.get('author', '')\n", " })\n", " \n", " # Process nested comments recursively\n", " if 'comments' in subcomment and subcomment['comments']:\n", " for nested_comment in subcomment['comments']:\n", " recursive_comment_parsing(dict_array, nested_comment)\n", " \n", " return dict_array" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def generate_csv(data):\n", " comments = []\n", " for subsection in data:\n", " if subsection['comments'] != []:\n", " for subcomment in subsection['comments']:\n", " comments = recursive_comment_parsing(comments, subcomment)\n", " df = pd.DataFrame(comments)\n", " return df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "fp =\"/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/ve-feedback-2011-12.json\"\n", "with open(fp, 'r') as file:\n", " data = json.load(file)\n", "data = data['sections']\n", "test_df = generate_csv(data)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
comment_textdate_createdAuthor
0[{{talkarchive}}\\n, \\n]
1[== Link editing should work when cursor is on...20:38, 13 December 2011 (UTC)Dantman
2[\\n, :Yes, the little down-arrow with B/F/Link...22:16, 13 December 2011 (UTC)JakobVoss
3[\\n]
4[== Collapse same text links ==\\n, \\n, <small>...20:43, 13 December 2011 (UTC)Dantman
............
429[\\n]
430[== User Interface ==\\n, \\n, <small>User agent...12:02, 30 December 2011 (UTC)TJRana
431[\\n]
432[== Feedback by users that see no feedback lin...
433[== Usability ==\\n, \\n, It's great that you're...
\n", "

434 rows × 3 columns

\n", "
" ], "text/plain": [ " comment_text \\\n", "0 [{{talkarchive}}\\n, \\n] \n", "1 [== Link editing should work when cursor is on... \n", "2 [\\n, :Yes, the little down-arrow with B/F/Link... \n", "3 [\\n] \n", "4 [== Collapse same text links ==\\n, \\n, ... \n", ".. ... \n", "429 [\\n] \n", "430 [== User Interface ==\\n, \\n, User agent... \n", "431 [\\n] \n", "432 [== Feedback by users that see no feedback lin... \n", "433 [== Usability ==\\n, \\n, It's great that you're... \n", "\n", " date_created Author \n", "0 \n", "1 20:38, 13 December 2011 (UTC) Dantman \n", "2 22:16, 13 December 2011 (UTC) JakobVoss \n", "3 \n", "4 20:43, 13 December 2011 (UTC) Dantman \n", ".. ... ... \n", "429 \n", "430 12:02, 30 December 2011 (UTC) TJRana \n", "431 \n", "432 \n", "433 \n", "\n", "[434 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "fp2 =\"/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json\"\n", "with open(fp2, 'r') as file2:\n", " data2 = json.load(file2)\n", "data2 = data2['sections']\n", "test_df2 = generate_csv(data2)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", "Index: []" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df2" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "df_combined = pd.concat([test_df, test_df2], ignore_index=True)\n", "df_cleaned = df_combined[df_combined['Author'] != '']" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
comment_textdate_createdAuthor
1[==Parsoid working but getting JSON error==\\n,...03:07, 31 May 2015 (UTC)Alex Mashin
2[\\n, ----\\n, \\n, I find out that when you use ...09:30, 20 Jun 2015 (UTC)mnrahimi
4[==Set up in web server?==\\n, My wiki is on th...15:37, 24 June 2015 (UTC)Arlolra
6[== Multiple wikis ==\\n, Can several Mediawiki...18:35, 13 October 2012 (UTC)Amire80
7[\\n, : Yes. The Parsoid takes a prefix when th...22:34, 15 October 2012 (UTC)Jdforrester (WMF)
............
160[== Installation on OS X {{Resolved}} ==\\n, \\n...13:53, 31 May 2015 (UTC)Dieudo
161[\\n, : Resolved by [https://www.mediawiki.org/...15:09, 24 June 2015 (UTC)Arlolra
163[== Error: Cannot find module '/etc/mediawiki/...16:35, 29 October 2015 (UTC)Krauss
164[\\n, : bin/server.js is in parsoid master and ...16:37, 29 October 2015 (UTC)SSastry (WMF)
166[==Can see VisualEditor interface but can't us...00:44, 19 March 2016 (UTC)2A01:E35:2F5E:A0E0:948F:579B:2435:72DB
\n", "

106 rows × 3 columns

\n", "
" ], "text/plain": [ " comment_text \\\n", "1 [==Parsoid working but getting JSON error==\\n,... \n", "2 [\\n, ----\\n, \\n, I find out that when you use ... \n", "4 [==Set up in web server?==\\n, My wiki is on th... \n", "6 [== Multiple wikis ==\\n, Can several Mediawiki... \n", "7 [\\n, : Yes. The Parsoid takes a prefix when th... \n", ".. ... \n", "160 [== Installation on OS X {{Resolved}} ==\\n, \\n... \n", "161 [\\n, : Resolved by [https://www.mediawiki.org/... \n", "163 [== Error: Cannot find module '/etc/mediawiki/... \n", "164 [\\n, : bin/server.js is in parsoid master and ... \n", "166 [==Can see VisualEditor interface but can't us... \n", "\n", " date_created Author \n", "1 03:07, 31 May 2015 (UTC) Alex Mashin \n", "2 09:30, 20 Jun 2015 (UTC) mnrahimi \n", "4 15:37, 24 June 2015 (UTC) Arlolra \n", "6 18:35, 13 October 2012 (UTC) Amire80 \n", "7 22:34, 15 October 2012 (UTC) Jdforrester (WMF) \n", ".. ... ... \n", "160 13:53, 31 May 2015 (UTC) Dieudo \n", "161 15:09, 24 June 2015 (UTC) Arlolra \n", "163 16:35, 29 October 2015 (UTC) Krauss \n", "164 16:37, 29 October 2015 (UTC) SSastry (WMF) \n", "166 00:44, 19 March 2016 (UTC) 2A01:E35:2F5E:A0E0:948F:579B:2435:72DB \n", "\n", "[106 rows x 3 columns]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_cleaned" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "output_file = '0220_ve_rfcs_text.csv'\n", "df_cleaned.to_csv(output_file, index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }