1
0
mw-convo-collections/src/helper_scripts/cleaning_scripts/cleaning_phabricator.ipynb
2025-05-12 10:49:37 -05:00

276 lines
9.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# All imports up front so the notebook survives Restart & Run All.\n",
"import csv\n",
"import json\n",
"\n",
"# pandas is needed by the roster-loading and date-range cells below.\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6139"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the first batch of Phabricator task data (HTTPS 2013 window);\n",
"# the bare len() on the last line displays the task count (6139).\n",
"fp =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/https_1_phab_data.json\"\n",
"with open(fp, 'r') as file:\n",
" data1 = json.load(file)\n",
"len(data1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9163"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Second batch: tasks from the 06-14-2012 .. 06-16-2013 query window.\n",
"fp1 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_06-14-2012_06-16-2013_phab_data.json\"\n",
"with open(fp1, 'r') as file:\n",
" data2 = json.load(file)\n",
"len(data2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4735"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Third batch: tasks from the 09-01-2011 .. 06-15-2012 query window.\n",
"fp2 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_09-01-2011_06-15-2012_phab_data.json\"\n",
"with open(fp2, 'r') as file:\n",
" data3 = json.load(file)\n",
"len(data3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Collect the three loaded batches for combined processing below;\n",
"# the query windows overlap, so concat_and_save deduplicates rows.\n",
"data_list = [data1, data2, data3]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Loading in the master roster of WMF Phabricator accounts; the resulting\n",
"# `wmf_phids` list is used below to flag WMF affiliation of authors.\n",
"# NOTE(review): assumes each roster record has a 'phid' field — holds for\n",
"# this roster file, confirm if the roster format changes.\n",
"import pandas as pd\n",
"roster_path = \"/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/cleaning_scripts/022825_wmf_master_phab_roster.json\"\n",
"with open(roster_path, 'r') as file:\n",
" roster_data = json.load(file)\n",
"roster_df = pd.DataFrame(roster_data)\n",
"wmf_phids = roster_df['phid'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def generate_csv(data, output_file):\n",
"    \"\"\"\n",
"    Write one CSV row per task description and per task subcomment.\n",
"\n",
"    Columns: task_title, comment_text, date_created, AuthorPHID,\n",
"    WMFaffil, TaskPHID, comment_type, status.\n",
"\n",
"    Args:\n",
"        data (list): List of Phabricator task dicts, each augmented with a\n",
"            'task_comments' mapping of transaction id -> list of comments.\n",
"        output_file (str): Path to the output CSV file (overwritten).\n",
"\n",
"    Note: relies on the module-level `wmf_phids` list for affiliation.\n",
"    \"\"\"\n",
"    with open(output_file, 'w', newline='') as csvfile:\n",
"        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"\n",
"        for project in data:\n",
"            # One row for the task description itself.\n",
"            writer.writerow({\n",
"                'task_title': project['fields']['name'],\n",
"                'comment_text': project['fields']['description']['raw'],\n",
"                'date_created': project['fields']['dateCreated'],\n",
"                'AuthorPHID': project['fields']['authorPHID'],\n",
"                'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
"                'TaskPHID': project['phid'],\n",
"                'comment_type': 'task_description',\n",
"                'status': project['fields']['status']['value']\n",
"            })\n",
"\n",
"            # One row per subcomment on the task.\n",
"            for key, value in project['task_comments'].items():\n",
"                if value:\n",
"                    for subcomment in value:\n",
"                        writer.writerow({\n",
"                            'task_title': project['fields']['name'],\n",
"                            'comment_text': subcomment['content']['raw'],\n",
"                            'date_created': subcomment['dateCreated'],\n",
"                            'AuthorPHID': subcomment['authorPHID'],\n",
"                            # BUG FIX: affiliation must reflect the\n",
"                            # subcomment's author, not the task author.\n",
"                            'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
"                            'TaskPHID': project['phid'],\n",
"                            'comment_type': 'task_subcomment',\n",
"                            'status': 'NA'\n",
"                        })"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def concat_and_save(data_object_list, output_file):\n",
"    \"\"\"\n",
"    Concatenate several task-data batches into one CSV, dropping\n",
"    exact-duplicate rows (the query windows overlap, so the same task\n",
"    or comment can appear in more than one batch).\n",
"\n",
"    Args:\n",
"        data_object_list (list): List of task-data lists, e.g.\n",
"            [data1, data2, data3].\n",
"        output_file (str): Path to the output CSV file (overwritten).\n",
"\n",
"    Note: relies on the module-level `wmf_phids` list for affiliation.\n",
"    \"\"\"\n",
"    # Track rows already written so duplicates are skipped.\n",
"    unique_rows = set()\n",
"\n",
"    with open(output_file, 'w', newline='') as csvfile:\n",
"        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"\n",
"        for data in data_object_list:\n",
"            for project in data:\n",
"                # One row for the task description itself.\n",
"                row = {\n",
"                    'task_title': project['fields']['name'],\n",
"                    'comment_text': project['fields']['description']['raw'],\n",
"                    'date_created': project['fields']['dateCreated'],\n",
"                    'AuthorPHID': project['fields']['authorPHID'],\n",
"                    'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
"                    'TaskPHID': project['phid'],\n",
"                    'comment_type': 'task_description',\n",
"                    'status': project['fields']['status']['value']\n",
"                }\n",
"                row_tuple = tuple(row.items())\n",
"                if row_tuple not in unique_rows:\n",
"                    unique_rows.add(row_tuple)\n",
"                    writer.writerow(row)\n",
"\n",
"                # One row per subcomment on the task.\n",
"                for key, value in project['task_comments'].items():\n",
"                    if value:\n",
"                        for subcomment in value:\n",
"                            row = {\n",
"                                'task_title': project['fields']['name'],\n",
"                                'comment_text': subcomment['content']['raw'],\n",
"                                'date_created': subcomment['dateCreated'],\n",
"                                'AuthorPHID': subcomment['authorPHID'],\n",
"                                # BUG FIX: affiliation must reflect the\n",
"                                # subcomment's author, not the task author.\n",
"                                'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
"                                'TaskPHID': project['phid'],\n",
"                                'comment_type': 'task_subcomment',\n",
"                                'status': 'NA'\n",
"                            }\n",
"                            row_tuple = tuple(row.items())\n",
"                            if row_tuple not in unique_rows:\n",
"                                unique_rows.add(row_tuple)\n",
"                                writer.writerow(row)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Write the deduplicated comments from all three batches to one CSV.\n",
"concat_and_save(data_list, '0512_https_phab_comments.csv')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# BUG FIX: `data` was never defined in this notebook (stale cell left\n",
"# over from an earlier session; note the out-of-order execution count),\n",
"# so this cell failed on Restart & Run All. Use a loaded batch explicitly.\n",
"generate_csv(data1, \"0402_https2_phab_comments.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Minimum date_created: 1314866460\n",
"Maximum date_created: 1746664402\n"
]
}
],
"source": [
"df = pd.read_csv(\"0512_https_phab_comments.csv\")\n",
"# date_created holds Unix epoch timestamps (integer seconds); no datetime\n",
"# conversion is performed here, so min/max print as raw epoch values.\n",
"\n",
"# Get the minimum and maximum date_created values\n",
"min_date = df['date_created'].min()\n",
"max_date = df['date_created'].max()\n",
"\n",
"print(f\"Minimum date_created: {min_date}\")\n",
"print(f\"Maximum date_created: {max_date}\")"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}