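Fix labeling sample and batched categorization input: build raw_text from comment_text (task title prepended for task descriptions) instead of task_title alone; rename output CSVs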

2025-09-16 11:43:28 -05:00 · 2025-09-16 11:43:28 -05:00 · 89969daab5
commit 89969daab5
parent d83022f184
3 changed files with 310 additions and 303 deletions
--- a/dsl/091625_human_text_sample.csv
+++ b/dsl/091625_human_text_sample.csv
--- a/p2/quest/python_scripts/090425_batched_olmo_cat.py
+++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py
@ -71,7 +71,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        text_dict['task_title'] = row[1]
        text_dict['comment_text'] = row[2]
        text_dict['comment_type'] = row[12]
-        raw_text = text_dict['task_title']
+        if text_dict['comment_type'] == "task_description":
+            raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
+        else: 
+            raw_text = text_dict['comment_text']
        
        # comment_text preprocessing per https://arxiv.org/pdf/1902.07093
        # 1. replace code with CODE
@ -119,7 +122,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        array_of_categorizations.append(text_dict)
    df = pd.DataFrame(array_of_categorizations)
    #print(df.head())
-    df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)
+    df.to_csv('all_091625_olmo_batched_categorized.csv', index=False)


 	    
--- a/p2/quest/python_scripts/label_sampling.py
+++ b/p2/quest/python_scripts/label_sampling.py
@ -71,7 +71,11 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        text_dict['task_title'] = row[1]
        text_dict['comment_text'] = row[2]
        text_dict['comment_type'] = row[12]
-        raw_text = text_dict['task_title']
+        # Task descriptions get the task title prepended to their text; all other comment types use comment_text alone.
+        if text_dict['comment_type'] == "task_description":
+            raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
+        else:
+            raw_text = text_dict['comment_text']
        
        # comment_text preprocessing per https://arxiv.org/pdf/1902.07093
        # 1. replace code with CODE
@ -120,7 +124,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        array_of_categorizations.append(text_dict)
    df = pd.DataFrame(array_of_categorizations)
    random_df = df.sample(n=300, random_state=8)
-    random_df.to_csv('091425_human_text_sample.csv', index=False)
+    random_df.to_csv('091625_human_text_sample.csv', index=False)