{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6139"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fp = \"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/https_1_phab_data.json\"\n",
    "with open(fp, 'r') as file:\n",
    "    data1 = json.load(file)\n",
    "len(data1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9163"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fp1 = \"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_06-14-2012_06-16-2013_phab_data.json\"\n",
    "with open(fp1, 'r') as file:\n",
    "    data2 = json.load(file)\n",
    "len(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4735"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fp2 = \"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_09-01-2011_06-15-2012_phab_data.json\"\n",
    "with open(fp2, 'r') as file:\n",
    "    data3 = json.load(file)\n",
    "len(data3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list = [data1, data2, data3]"
   ]
  },
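  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick sanity check on the combined inputs (a minimal sketch; the per-file counts come from the cells above): the three files hold 6139 + 9163 + 4735 = 20037 tasks, but the two HTTP date ranges in the filenames overlap, so some tasks may appear in more than one file. The deduplication in `concat_and_save` below handles that."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total tasks across the three input files (before deduplication);\n",
    "# expected value given the lengths above is 20037.\n",
    "sum(len(d) for d in data_list)"
   ]
  },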
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading in the master roster of WMF-affiliated Phabricator account PHIDs\n",
    "import pandas as pd\n",
    "roster_path = \"/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/cleaning_scripts/022825_wmf_master_phab_roster.json\"\n",
    "with open(roster_path, 'r') as file:\n",
    "    roster_data = json.load(file)\n",
    "roster_df = pd.DataFrame(roster_data)\n",
    "wmf_phids = roster_df['phid'].tolist()"
   ]
  },
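  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`wmf_phids` is a plain list, so each `authorPHID in wmf_phids` check in the functions below is a linear scan. A set makes those membership tests O(1); a minimal sketch (`wmf_phid_set` is an illustrative name, and the functions below still use the list):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set membership is O(1) vs. O(n) for a list; with ~20k rows each\n",
    "# checked against the roster, this avoids repeated linear scans.\n",
    "wmf_phid_set = set(wmf_phids)\n",
    "len(wmf_phid_set)"
   ]
  },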
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_csv(data, output_file):\n",
    "    \"\"\"\n",
    "    Generate a CSV file with eight columns: task_title, comment_text, date_created, AuthorPHID, WMFaffil, TaskPHID, comment_type, status.\n",
    "\n",
    "    Args:\n",
    "        data (list): The list of project data.\n",
    "        output_file (str): The path to the output CSV file.\n",
    "    \"\"\"\n",
    "    with open(output_file, 'w', newline='') as csvfile:\n",
    "        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
    "        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "\n",
    "        writer.writeheader()\n",
    "\n",
    "        for project in data:\n",
    "            # Add the task description as the first comment row\n",
    "            writer.writerow({\n",
    "                'task_title': project['fields']['name'],\n",
    "                'comment_text': project['fields']['description']['raw'],\n",
    "                'date_created': project['fields']['dateCreated'],\n",
    "                'AuthorPHID': project['fields']['authorPHID'],\n",
    "                'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
    "                'TaskPHID': project['phid'],\n",
    "                'comment_type': 'task_description',\n",
    "                'status': project['fields']['status']['value']\n",
    "            })\n",
    "\n",
    "            # Add each task comment as its own row\n",
    "            for key, value in project['task_comments'].items():\n",
    "                if value:\n",
    "                    for subcomment in value:\n",
    "                        writer.writerow({\n",
    "                            'task_title': project['fields']['name'],\n",
    "                            'comment_text': subcomment['content']['raw'],\n",
    "                            'date_created': subcomment['dateCreated'],\n",
    "                            'AuthorPHID': subcomment['authorPHID'],\n",
    "                            # Affiliation of the comment's author, not the task author\n",
    "                            'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
    "                            'TaskPHID': project['phid'],\n",
    "                            'comment_type': 'task_subcomment',\n",
    "                            'status': 'NA'\n",
    "                        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def concat_and_save(data_object_list, output_file):\n",
    "    # Use a set of row tuples to skip duplicates across the input files\n",
    "    unique_rows = set()\n",
    "\n",
    "    with open(output_file, 'w', newline='') as csvfile:\n",
    "        fieldnames = ['task_title', 'comment_text', 'date_created', 'AuthorPHID', 'WMFaffil', 'TaskPHID', 'comment_type', 'status']\n",
    "        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "\n",
    "        writer.writeheader()\n",
    "\n",
    "        for data in data_object_list:\n",
    "            for project in data:\n",
    "                # Add the task description as the first comment row\n",
    "                row = {\n",
    "                    'task_title': project['fields']['name'],\n",
    "                    'comment_text': project['fields']['description']['raw'],\n",
    "                    'date_created': project['fields']['dateCreated'],\n",
    "                    'AuthorPHID': project['fields']['authorPHID'],\n",
    "                    'WMFaffil': project['fields']['authorPHID'] in wmf_phids,\n",
    "                    'TaskPHID': project['phid'],\n",
    "                    'comment_type': 'task_description',\n",
    "                    'status': project['fields']['status']['value']\n",
    "                }\n",
    "                row_tuple = tuple(row.items())\n",
    "                if row_tuple not in unique_rows:\n",
    "                    unique_rows.add(row_tuple)\n",
    "                    writer.writerow(row)\n",
    "\n",
    "                for key, value in project['task_comments'].items():\n",
    "                    if value:\n",
    "                        for subcomment in value:\n",
    "                            row = {\n",
    "                                'task_title': project['fields']['name'],\n",
    "                                'comment_text': subcomment['content']['raw'],\n",
    "                                'date_created': subcomment['dateCreated'],\n",
    "                                'AuthorPHID': subcomment['authorPHID'],\n",
    "                                # Affiliation of the comment's author, not the task author\n",
    "                                'WMFaffil': subcomment['authorPHID'] in wmf_phids,\n",
    "                                'TaskPHID': project['phid'],\n",
    "                                'comment_type': 'task_subcomment',\n",
    "                                'status': 'NA'\n",
    "                            }\n",
    "                            # Write the row only if it hasn't been seen before\n",
    "                            row_tuple = tuple(row.items())\n",
    "                            if row_tuple not in unique_rows:\n",
    "                                unique_rows.add(row_tuple)\n",
    "                                writer.writerow(row)"
   ]
  },
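  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why `tuple(row.items())` works as a deduplication key: every row dict above is built with the same literal key order, and Python dicts preserve insertion order, so two identical rows always produce identical, hashable tuples. A toy illustration:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two dicts built with the same keys and values yield equal item tuples,\n",
    "# so the second occurrence would be caught by the unique_rows set.\n",
    "a = {'x': 1, 'y': 2}\n",
    "b = {'x': 1, 'y': 2}\n",
    "tuple(a.items()) == tuple(b.items())"
   ]
  },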
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_and_save(data_list, '0512_https_phab_comments.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single-file export; unlike concat_and_save, generate_csv does not deduplicate\n",
    "generate_csv(data1, \"0402_https2_phab_comments.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Minimum date_created: 1314866460\n",
      "Maximum date_created: 1746664402\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv(\"0512_https_phab_comments.csv\")\n",
    "\n",
    "# date_created holds Unix epoch timestamps (seconds); min/max give the\n",
    "# earliest and latest activity in the combined corpus\n",
    "min_date = df['date_created'].min()\n",
    "max_date = df['date_created'].max()\n",
    "\n",
    "print(f\"Minimum date_created: {min_date}\")\n",
    "print(f\"Maximum date_created: {max_date}\")"
   ]
  }
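,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The bounds above are raw Unix timestamps. A minimal sketch for reading them as dates with pandas, where `unit='s'` interprets the integers as seconds since the epoch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert epoch seconds to pandas Timestamps for readability.\n",
    "print(f\"Earliest comment: {pd.to_datetime(min_date, unit='s')}\")\n",
    "print(f\"Latest comment: {pd.to_datetime(max_date, unit='s')}\")"
   ]
  }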
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}