From cb2fe737cdae92e822f829efc57411e61b563fd9 Mon Sep 17 00:00:00 2001
From: mgaughan
Date: Sat, 11 Oct 2025 07:38:11 -0500
Subject: [PATCH] updating batching script, preparing for run

---
 p2/quest/101025-batched-mw-olmo-info-cat.log    | 15 +++++++++++++++
 .../python_scripts/090425_batched_olmo_cat.py   | 13 +++++++++----
 p2/quest/slurm_jobs/090425_olmo_batched_cat.sh  |  2 +-
 3 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 p2/quest/101025-batched-mw-olmo-info-cat.log

diff --git a/p2/quest/101025-batched-mw-olmo-info-cat.log b/p2/quest/101025-batched-mw-olmo-info-cat.log
new file mode 100644
index 0000000..33e557d
--- /dev/null
+++ b/p2/quest/101025-batched-mw-olmo-info-cat.log
@@ -0,0 +1,15 @@
+setting up the environment by loading in conda environment at Sat Oct 11 00:24:37 CDT 2025
+running the batched olmo categorization job at Sat Oct 11 00:24:37 CDT 2025
+[nltk_data] Downloading package punkt_tab to
+[nltk_data]     /home/nws8519/nltk_data...
+[nltk_data]   Package punkt_tab is already up-to-date!
+cuda
+NVIDIA A100-SXM4-80GB
+_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=393ab5c3-2bcb-e4c6-52ad-eb4896a9d4fe, L2_cache_size=40MB)
+Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]
+Traceback (most recent call last):
+    with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+FileNotFoundError: [Errno 2] No such file or directory: '/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv'
+unsupervised batched olmo categorization pau at Sat Oct 11 00:27:22 CDT 2025
diff --git a/p2/quest/python_scripts/090425_batched_olmo_cat.py b/p2/quest/python_scripts/090425_batched_olmo_cat.py
index 8066565..b81e8e6 100644
--- a/p2/quest/python_scripts/090425_batched_olmo_cat.py
+++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py
@@ -40,6 +40,8 @@ TYPOLOGY:
 [[TASK PROGRESS]], in which stakeholders request or report progress of tasks and sub-tasks towards the solution of the issue. For example, “I made an initial stab at it... - this is just a proof of concept that gets the version string into nodejs. I’ll start working on adding the swig interfaces...”
 
 [[TESTING]], in which participants discuss the testing procedure and results, as well as the system environment, code, data, and feedback involved in testing. For example, “Tested on ‘0.101’ and ‘master’ - the issue seems to be fixed on ‘master’ not just for the example document, but for the entire corpus...”
+
+[[NA]], in which the sentence contents are entirely incomprehensible or only consist of punctuation or numerals. For example, "***", "ve-ce-protectedNode", or "T8597".
 
 [[FUTURE PLAN]], in which participants discuss the long-term plan related to the issue; such plans usually involve work/ideas that are not required to close the current issue.
 For example, “For the futures, stay tuned, as we’re prototyping something in this direction.”
@@ -57,7 +59,7 @@
 """
 instructions="The sentence's category is: "
 
-with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", mode='r', newline='') as file:
+with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
     reader = csv.reader(file)
     array_of_categorizations = []
     index = -1
@@ -70,7 +72,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
             text_dict['id'] = row[0]
             text_dict['task_title'] = row[1]
             text_dict['comment_text'] = row[2]
-            text_dict['comment_type'] = row[12]
+            text_dict['date_created'] = row[3]
+            text_dict['comment_type'] = row[6]
+            text_dict['TaskPHID'] = row[5]
+            text_dict['AuthorPHID'] = row[4]
             if text_dict['comment_type'] == "task_description":
                 raw_text = text_dict['task_title'] + ". \n\n" + text_dict['comment_text']
             else:
@@ -102,7 +107,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
         batch = comment_sentences[i:i+batch_size]
         prompts = []
         for sent in batch:
-            given_data = f"**GIVEN SENTENCE: \n ' Type -text_dict['comment_type'] \n Text -{sent}**'\n"
+            given_data = f"**GIVEN SENTENCE: \n ' Type -text_dict['task_title'] \n Text -{sent}**'\n"
             prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
             prompts.append(prompt)
         inputs = tokenizer(prompts, return_tensors='pt', return_token_type_ids=False, padding=True, truncation=True).to(device)
@@ -122,7 +127,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
         array_of_categorizations.append(text_dict)
     df = pd.DataFrame(array_of_categorizations)
     #print(df.head())
-    df.to_csv('all_092225_olmo_batched_categorized.csv', index=False)
+    df.to_csv('all_101025_olmo_batched_categorized.csv', index=False)
 
 
 
diff --git a/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
index c251c09..098bd97 100644
--- a/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
+++ b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
@@ -9,7 +9,7 @@
 #SBATCH --mem=64G
 #SBATCH --cpus-per-task=4
 #SBATCH --job-name=batched-MW-info-typology
-#SBATCH --output=batched-mw-olmo-info-cat.log
+#SBATCH --output=101025-batched-mw-olmo-info-cat.log
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --mail-user=gaughan@u.northwestern.edu
 