1
0
mw-convo-collections/src/helper_scripts/cleaning_scripts/cleaning_phabricator.ipynb
2025-05-12 10:49:37 -05:00

276 lines
9.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# All imports up front so the notebook survives Restart & Run All.\n",
"import csv\n",
"import json\n",
"\n",
"# pandas is needed by the roster-loading and date-range cells below.\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6139"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the first batch of Phabricator task data (HTTPS 2013 window);\n",
"# the bare len() on the last line displays the task count (6139).\n",
"fp =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/https_1_phab_data.json\"\n",
"with open(fp, 'r') as file:\n",
" data1 = json.load(file)\n",
"len(data1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9163"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Second batch: tasks from the 06-14-2012 .. 06-16-2013 query window.\n",
"fp1 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_06-14-2012_06-16-2013_phab_data.json\"\n",
"with open(fp1, 'r') as file:\n",
" data2 = json.load(file)\n",
"len(data2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4735"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Third batch: tasks from the 09-01-2011 .. 06-15-2012 query window.\n",
"fp2 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_09-01-2011_06-15-2012_phab_data.json\"\n",
"with open(fp2, 'r') as file:\n",
" data3 = json.load(file)\n",
"len(data3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Collect the three loaded batches for combined processing below;\n",
"# the query windows overlap, so concat_and_save deduplicates rows.\n",
"data_list = [data1, data2, data3]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Loading in the master roster of WMF Phabricator accounts; the resulting\n",
"# `wmf_phids` list is used below to flag WMF affiliation of authors.\n",
"# NOTE(review): assumes each roster record has a 'phid' field — holds for\n",
"# this roster file, confirm if the roster format changes.\n",
"import pandas as pd\n",
"roster_path = \"/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/cleaning_scripts/022825_wmf_master_phab_roster.json\"\n",
"with open(roster_path, 'r') as file:\n",
" roster_data = json.load(file)\n",
"roster_df = pd.DataFrame(roster_data)\n",
"wmf_phids = roster_df['phid'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def generate_csv(data, output_file):\n",
"    \"\"\"\n",
"    Write one CSV row per task description and per task subcomment.\n",
"\n",
"    Columns: task_title, comment_text, date_created, AuthorPHID,\n",
"    WMFaffil, TaskPHID, comment_type, status.\n",
"\n",
"    Args:\n",
"        data (list): List of Phabricator task dicts, each augmented with a\n",
"            'task_comments' mapping of transaction id -> list of comments.\n",
"        output_file (str): Path to the output CSV file (overwritten).\n",
"\n",
"    Note: relies on the module-level `wmf_phids` list for affiliation.\n",
"    \"\"\"\n",
"    with open(output_file, 'w', newline='') as csvfile:\n",
"        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"\n",
"        for project in data:\n",
"            # One row for the task description itself.\n",
"            writer.writerow({\n",
"                'task_title': project['fields']['name'],\n",
"                'comment_text': project['fields']['description']['raw'],\n",
"                'date_created': project['fields']['dateCreated'],\n",
"                'AuthorPHID': project['fields']['authorPHID'],\n",
"                'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
"                'TaskPHID': project['phid'],\n",
"                'comment_type': 'task_description',\n",
"                'status': project['fields']['status']['value']\n",
"            })\n",
"\n",
"            # One row per subcomment on the task.\n",
"            for key, value in project['task_comments'].items():\n",
"                if value:\n",
"                    for subcomment in value:\n",
"                        writer.writerow({\n",
"                            'task_title': project['fields']['name'],\n",
"                            'comment_text': subcomment['content']['raw'],\n",
"                            'date_created': subcomment['dateCreated'],\n",
"                            'AuthorPHID': subcomment['authorPHID'],\n",
"                            # BUG FIX: affiliation must reflect the\n",
"                            # subcomment's author, not the task author.\n",
"                            'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
"                            'TaskPHID': project['phid'],\n",
"                            'comment_type': 'task_subcomment',\n",
"                            'status': 'NA'\n",
"                        })"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def concat_and_save(data_object_list, output_file):\n",
"    \"\"\"\n",
"    Concatenate several task-data batches into one CSV, dropping\n",
"    exact-duplicate rows (the query windows overlap, so the same task\n",
"    or comment can appear in more than one batch).\n",
"\n",
"    Args:\n",
"        data_object_list (list): List of task-data lists, e.g.\n",
"            [data1, data2, data3].\n",
"        output_file (str): Path to the output CSV file (overwritten).\n",
"\n",
"    Note: relies on the module-level `wmf_phids` list for affiliation.\n",
"    \"\"\"\n",
"    # Track rows already written so duplicates are skipped.\n",
"    unique_rows = set()\n",
"\n",
"    with open(output_file, 'w', newline='') as csvfile:\n",
"        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
"        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
"        writer.writeheader()\n",
"\n",
"        for data in data_object_list:\n",
"            for project in data:\n",
"                # One row for the task description itself.\n",
"                row = {\n",
"                    'task_title': project['fields']['name'],\n",
"                    'comment_text': project['fields']['description']['raw'],\n",
"                    'date_created': project['fields']['dateCreated'],\n",
"                    'AuthorPHID': project['fields']['authorPHID'],\n",
"                    'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
"                    'TaskPHID': project['phid'],\n",
"                    'comment_type': 'task_description',\n",
"                    'status': project['fields']['status']['value']\n",
"                }\n",
"                row_tuple = tuple(row.items())\n",
"                if row_tuple not in unique_rows:\n",
"                    unique_rows.add(row_tuple)\n",
"                    writer.writerow(row)\n",
"\n",
"                # One row per subcomment on the task.\n",
"                for key, value in project['task_comments'].items():\n",
"                    if value:\n",
"                        for subcomment in value:\n",
"                            row = {\n",
"                                'task_title': project['fields']['name'],\n",
"                                'comment_text': subcomment['content']['raw'],\n",
"                                'date_created': subcomment['dateCreated'],\n",
"                                'AuthorPHID': subcomment['authorPHID'],\n",
"                                # BUG FIX: affiliation must reflect the\n",
"                                # subcomment's author, not the task author.\n",
"                                'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
"                                'TaskPHID': project['phid'],\n",
"                                'comment_type': 'task_subcomment',\n",
"                                'status': 'NA'\n",
"                            }\n",
"                            row_tuple = tuple(row.items())\n",
"                            if row_tuple not in unique_rows:\n",
"                                unique_rows.add(row_tuple)\n",
"                                writer.writerow(row)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Write the deduplicated comments from all three batches to one CSV.\n",
"concat_and_save(data_list, '0512_https_phab_comments.csv')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# BUG FIX: `data` was never defined in this notebook (stale cell left\n",
"# over from an earlier session; note the out-of-order execution count),\n",
"# so this cell failed on Restart & Run All. Use a loaded batch explicitly.\n",
"generate_csv(data1, \"0402_https2_phab_comments.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Minimum date_created: 1314866460\n",
"Maximum date_created: 1746664402\n"
]
}
],
"source": [
"df = pd.read_csv(\"0512_https_phab_comments.csv\")\n",
"# date_created holds Unix epoch timestamps (integer seconds); no datetime\n",
"# conversion is performed here, so min/max print as raw epoch values.\n",
"\n",
"# Get the minimum and maximum date_created values\n",
"min_date = df['date_created'].min()\n",
"max_date = df['date_created'].max()\n",
"\n",
"print(f\"Minimum date_created: {min_date}\")\n",
"print(f\"Maximum date_created: {max_date}\")"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}