From 2d396ceb26b439880bea4d7cf31850f0cc85d8a3 Mon Sep 17 00:00:00 2001
From: mgaughan <mgaughan@proton.me>
Date: Tue, 2 Sep 2025 12:48:11 -0500
Subject: [PATCH] scaffolding out some work TODO on getting the olmo categories
 to be sentence-level

---
 p2/quest/python_scripts/info_labeling.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/p2/quest/python_scripts/info_labeling.py b/p2/quest/python_scripts/info_labeling.py
index ca31a17..4ecb20b 100644
--- a/p2/quest/python_scripts/info_labeling.py
+++ b/p2/quest/python_scripts/info_labeling.py
@@ -19,7 +19,7 @@ tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B")
 
 #TODO: text_preprocessing per https://arxiv.org/pdf/1902.07093
 
-priming = "For the **GIVEN COMMENT**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN COMMENT** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
+priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
 #the typology descriptions are taken straight from https://arxiv.org/pdf/1902.07093
 typology = """
 TYPOLOGY: 
@@ -58,7 +58,7 @@ TYPOLOGY:
 """
 
 #instructions="Only respond with the GIVEN COMMENT's [[CATEGORY]] classification. Do not provide any more information."
-instructions="The comment's category is: "
+instructions="The sentence's category is: "
 
 with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", mode='r', newline='') as file:
     reader = csv.reader(file)
@@ -93,8 +93,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
         comment_text = re.sub(url_pattern, 'URL', comment_text) 
         # 4. if possible, replace @ with SCREEN_NAME
         comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
+        # TODO: 5. split into an array of sentences
         #print(comment_text)
         
+        # TODO: run the steps below (prompt construction, generation, category parsing) once per sentence in the comment
         text_dict['cleaned_comment_text'] = comment_text
         #build out prompt construction; more specificity in data provided
         given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['cleaned_comment_text']}**'\n"
@@ -106,7 +108,6 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
         #deterministic sampling and getting the response back 
         response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
         response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
-        
         #print("this is the response:::: ----------------------------")
         #print(response_txt)
         #getting the resulting codes 
@@ -118,6 +119,9 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
         else: 
             following_text = "NO CATEGORY"
         #print(following_text)
+
+        # TODO: collate the per-sentence olmo categories back together into a list that preserves sentence order
+        # TODO: store that list of sentence-level olmo categories in text_dict
         text_dict['olmo_category'] = following_text
         '''
         for item in result.strip(";").split(";"):
@@ -130,6 +134,6 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
         array_of_categorizations.append(text_dict)
     #CSV everything
     df = pd.DataFrame(array_of_categorizations)
-    df.to_csv('072525_olmo_messages_categorized.csv', index=False)
+    #df.to_csv('072525_olmo_messages_categorized.csv', index=False)