updating some scripts

2025-09-14 11:14:16 -05:00 · 2025-09-14 11:14:16 -05:00 · f68372572f
commit f68372572f
parent f9c12bb445
2 changed files with 12 additions and 9 deletions
--- a/p2/quest/python_scripts/090425_batched_olmo_cat.py
+++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py
@ -71,7 +71,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        text_dict['task_title'] = row[1]
        text_dict['comment_text'] = row[2]
        text_dict['comment_type'] = row[12]
-        raw_text = text_dict['comment_text']
+        raw_text = text_dict['task_title']
        
        # comment_text preprocessing per https://arxiv.org/pdf/1902.07093
        # 1. replace code with CODE
@ -91,6 +91,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
        # 5. split into an array of sentences
        comment_sentences = nltk.sent_tokenize(comment_text)
+        text_dict['cleaned_sentences'] = comment_sentences

        results = []
        batch_size = 2
@ -118,7 +119,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
        array_of_categorizations.append(text_dict)
    df = pd.DataFrame(array_of_categorizations)
    #print(df.head())
-    df.to_csv('090425_olmo_batched_categorized.csv', index=False)
+    df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)


 	    
--- a/p2/quest/python_scripts/biberplus_labeling.py
+++ b/p2/quest/python_scripts/biberplus_labeling.py
@ -77,14 +77,16 @@ if __name__ == "__main__":
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
-    docs = first_discussion_df["comment_text"].astype(str).tolist()
+    #docs = first_discussion_df["comment_text"].astype(str).tolist()
+    task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
+    docs = task_description_df['task_title'].astype(str).tolist()
    #load model and run
    #model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    #new columns in the df for the predicted neurobiber items 
    #preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
-    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
+    final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
    #print(type(preds))
    #assigning the predictions as a new column 
    '''
@ -95,18 +97,18 @@ if __name__ == "__main__":
        how='inner'
    )
    '''
-    print(first_discussion_df)
-    print(final_discussion_df)
+    #print(first_discussion_df)
+    #print(final_discussion_df)
    #final_discussion_df["biberplus_preds"] = list(preds)
    #assert that order has been preserved 
    for _ in range(1000):
        random_index = random.randrange(len(final_discussion_df))
-        assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
+        assert task_description_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
        #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    #assert that the source dataframe and final_discussion_df have the same number of rows
-    assert len(first_discussion_df) == len(final_discussion_df)
+    assert len(task_description_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
-    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False)
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
    print('biberplus labeling pau')