updating collection scripts/data

2025-05-18 18:57:24 -05:00 · 2025-05-18 18:57:24 -05:00 · 993bbe658b
commit 993bbe658b
parent 9c7ab02e3d
3 changed files with 917955 additions and 15 deletions
--- a/src/helper_scripts/cleaning_scripts/0514_https_phab_comments.csv
+++ b/src/helper_scripts/cleaning_scripts/0514_https_phab_comments.csv
--- a/src/helper_scripts/cleaning_scripts/cleaning_phabricator.ipynb
+++ b/src/helper_scripts/cleaning_scripts/cleaning_phabricator.ipynb
@@ -81,16 +81,39 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1760"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "data_list = [data1, data2, data3]"
+    "fp3 =\"/data/users/mgaughan/mw-repo-lifecycles/phab_data/https2013/http_10-21-2013_12-5-2013_phab_data.json\"\n",
+    "with open(fp3, 'r') as file:\n",
+    "        data4 = json.load(file)\n",
+    "len(data4)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_list = [data1, data2, data3, data4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -105,7 +128,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -154,7 +177,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -208,11 +231,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
-    "concat_and_save(data_list, '0512_https_phab_comments.csv')"
+    "concat_and_save(data_list, '0514_https_phab_comments.csv')"
   ]
  },
  {
@@ -226,7 +249,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -234,12 +257,12 @@
     "output_type": "stream",
     "text": [
      "Minimum date_created: 1314866460\n",
-      "Maximum date_created: 1746664402\n"
+      "Maximum date_created: 1746822176\n"
     ]
    }
   ],
   "source": [
-    "df = pd.read_csv(\"0512_https_phab_comments.csv\")\n",
+    "df = pd.read_csv(\"0514_https_phab_comments.csv\")\n",
    "# Convert the 'date_created' column to datetime format\n",
    "\n",
    "# Get the minimum and maximum date_created values\n",
--- a/src/lib/phab_get.py
+++ b/src/lib/phab_get.py
@@ -106,7 +106,7 @@ def query_transactions_phid_task(
            'limit':limit,
            'after':after,
        }
-        response = requests.get( api_url_base, params=params)
+        response = requests.get(api_url_base, params=params)
        try:
            result = json.loads(response.text)['result']
            data_tmp = result['data']
@@ -179,8 +179,8 @@ if __name__ == "__main__":
    api_base = 'https://phabricator.wikimedia.org/api/'

    #p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2011, 9, 1, 0, 0, 0)))
-    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2012, 6, 14, 0, 0, 0)))
-    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2013, 6, 16, 0, 0, 0)))
+    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2013, 10, 21, 0, 0, 0)))
+    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2013, 12, 5, 0, 0, 0)))

    p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)
    for entry in p_data:
@@ -192,7 +192,7 @@ if __name__ == "__main__":
            comments[item['id']] = item['comments']
        entry['task_comments'] = comments
    DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
-    with open(f"{DATA_PREFIX}{tag}_06-14-2012_06-16-2013_phab_data.json", "w") as outfile1:
+    with open(f"{DATA_PREFIX}{tag}_10-21-2013_12-5-2013_phab_data.json", "w") as outfile1:
        json.dump(p_data, outfile1)
    '''
    user = query_users()