{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import csv" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6139" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fp =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/https_1_phab_data.json\"\n", "with open(fp, 'r') as file:\n", " data1 = json.load(file)\n", "len(data1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9163" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fp1 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_06-14-2012_06-16-2013_phab_data.json\"\n", "with open(fp1, 'r') as file:\n", " data2 = json.load(file)\n", "len(data2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4735" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fp2 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_09-01-2011_06-15-2012_phab_data.json\"\n", "with open(fp2, 'r') as file:\n", " data3 = json.load(file)\n", "len(data3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data_list = [data1, data2, data3]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#loading in the master list of \n", "import pandas as pd\n", "roster_path = \"/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/cleaning_scripts/022825_wmf_master_phab_roster.json\"\n", "with open(roster_path, 'r') as file:\n", " roster_data = json.load(file)\n", "roster_df = pd.DataFrame(roster_data)\n", "wmf_phids = roster_df['phid'].tolist()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def generate_csv(data, output_file):\n", " \"\"\"\n", " Generate a CSV file with four columns: comment_text, date_created, AuthorPHID, TaskPHID, comment_type, status.\n", " \n", " Args:\n", " data (list): The list of project data.\n", " output_file (str): The path to the output CSV file.\n", " \"\"\"\n", " with open(output_file, 'w', newline='') as csvfile:\n", " fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n", " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", " \n", " writer.writeheader()\n", " \n", " for project in data:\n", " # Add project description as a comment\n", " writer.writerow({\n", " 'task_title': project['fields']['name'],\n", " 'comment_text': project['fields']['description']['raw'],\n", " 'date_created': project['fields']['dateCreated'],\n", " 'AuthorPHID': project['fields']['authorPHID'],\n", " 'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n", " 'TaskPHID' : project['phid'],\n", " 'comment_type': 'task_description',\n", " 'status': project['fields']['status']['value']\n", " })\n", " \n", " # Add task comments\n", " for key, value in project['task_comments'].items():\n", " if value:\n", " for subcomment in value:\n", " writer.writerow({\n", " 'task_title': project['fields']['name'],\n", " 'comment_text': subcomment['content']['raw'],\n", " 'date_created': subcomment['dateCreated'],\n", " 'AuthorPHID': subcomment['authorPHID'],\n", " 'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n", " 'TaskPHID' : project['phid'],\n", " 'comment_type': 'task_subcomment',\n", " 'status': 'NA'\n", " })" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def concat_and_save(data_object_list, output_file):\n", " # Use a set to track unique rows\n", " unique_rows = set()\n", "\n", " with open(output_file, 'w', newline='') as csvfile:\n", " fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n", " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n", " \n", " writer.writeheader()\n", " \n", " for data in data_object_list:\n", " for project in data:\n", " # Add project description as a comment\n", " row = {\n", " 'task_title': project['fields']['name'],\n", " 'comment_text': project['fields']['description']['raw'],\n", " 'date_created': project['fields']['dateCreated'],\n", " 'AuthorPHID': project['fields']['authorPHID'],\n", " 'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n", " 'TaskPHID': project['phid'],\n", " 'comment_type': 'task_description',\n", " 'status': project['fields']['status']['value']\n", " }\n", " row_tuple = tuple(row.items())\n", " if row_tuple not in unique_rows:\n", " unique_rows.add(row_tuple)\n", " writer.writerow(row)\n", " \n", " for key, value in project['task_comments'].items():\n", " if value:\n", " for subcomment in value:\n", " row = {\n", " 'task_title': project['fields']['name'],\n", " 'comment_text': subcomment['content']['raw'],\n", " 'date_created': subcomment['dateCreated'],\n", " 'AuthorPHID': subcomment['authorPHID'],\n", " 'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n", " 'TaskPHID': project['phid'],\n", " 'comment_type': 'task_subcomment',\n", " 'status': 'NA'\n", " }\n", " # Add row to the set if it's unique\n", " row_tuple = tuple(row.items())\n", " if row_tuple not in unique_rows:\n", " unique_rows.add(row_tuple)\n", " writer.writerow(row)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "concat_and_save(data_list, '0512_https_phab_comments.csv')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "generate_csv(data, \"0402_https2_phab_comments.csv\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum date_created: 1314866460\n", "Maximum date_created: 1746664402\n" ] } ], "source": [ "df = pd.read_csv(\"0512_https_phab_comments.csv\")\n", "# Convert the 'date_created' column to datetime format\n", "\n", "# Get the minimum and maximum date_created values\n", "min_date = df['date_created'].min()\n", "max_date = df['date_created'].max()\n", "\n", "print(f\"Minimum date_created: {min_date}\")\n", "print(f\"Maximum date_created: {max_date}\")" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }