From f68372572f5f2cfd985f992950b67bef4ddafef6 Mon Sep 17 00:00:00 2001 From: mgaughan Date: Sun, 14 Sep 2025 11:14:16 -0500 Subject: [PATCH] updating some scripts --- .../python_scripts/090425_batched_olmo_cat.py | 5 +++-- p2/quest/python_scripts/biberplus_labeling.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/p2/quest/python_scripts/090425_batched_olmo_cat.py b/p2/quest/python_scripts/090425_batched_olmo_cat.py index 2e05400..36c9b56 100644 --- a/p2/quest/python_scripts/090425_batched_olmo_cat.py +++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py @@ -71,7 +71,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_ text_dict['task_title'] = row[1] text_dict['comment_text'] = row[2] text_dict['comment_type'] = row[12] - raw_text = text_dict['comment_text'] + raw_text = text_dict['task_title'] # comment_text preprocessing per https://arxiv.org/pdf/1902.07093 # 1. replace code with CODE @@ -91,6 +91,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_ comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text) # 5. split into an array of sentences comment_sentences = nltk.sent_tokenize(comment_text) + text_dict['cleaned_sentences'] = comment_sentences results = [] batch_size = 2 @@ -118,7 +119,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_ array_of_categorizations.append(text_dict) df = pd.DataFrame(array_of_categorizations) #print(df.head()) - df.to_csv('090425_olmo_batched_categorized.csv', index=False) + df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False) diff --git a/p2/quest/python_scripts/biberplus_labeling.py b/p2/quest/python_scripts/biberplus_labeling.py index b6660a9..347f544 100644 --- a/p2/quest/python_scripts/biberplus_labeling.py +++ b/p2/quest/python_scripts/biberplus_labeling.py @@ -77,14 +77,16 @@ if __name__ == "__main__": #loading in the discussion data from the universal CSV first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") #formatting for the neurobiber model - docs = first_discussion_df["comment_text"].astype(str).tolist() + #docs = first_discussion_df["comment_text"].astype(str).tolist() + task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"] + docs = task_description_df['task_title'].astype(str).tolist() #load model and run #model, tokenizer = load_model_and_tokenizer() preds_df = biberplus_labeler(docs) #new columns in the df for the predicted neurobiber items #preds_cols = [f"neurobiber_{i+1}" for i in range(96)] #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index) - final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1) + final_discussion_df = pd.concat([task_description_df, preds_df], axis=1) #print(type(preds)) #assigning the preditions as a new column ''' @@ -95,18 +97,18 @@ if __name__ == "__main__": how='inner' ) ''' - print(first_discussion_df) - print(final_discussion_df) + #print(first_discussion_df) + #print(final_discussion_df) #final_discussion_df["biberplus_preds"] = list(preds) #assert that order has been preserved for _ in range(1000): random_index = random.randrange(len(final_discussion_df)) - assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"] + assert task_description_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"] #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] #assert that there are the same number of rows in first_discussion_df and second_discussion_df - assert len(first_discussion_df) == len(final_discussion_df) + assert len(task_description_df) == len(final_discussion_df) final_discussion_df = final_discussion_df.drop(columns=["message"]) # if passing the prior asserts, let's write to a csv - final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False) + final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False) print('biberplus labeling pau')