1
0

Fix labeling sample text: task descriptions now include the task title, all other comment types use the comment text alone

This commit is contained in:
mgaughan 2025-09-16 11:43:28 -05:00
parent d83022f184
commit 89969daab5
3 changed files with 310 additions and 303 deletions

View File

@ -71,7 +71,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
text_dict['task_title'] = row[1]
text_dict['comment_text'] = row[2]
text_dict['comment_type'] = row[12]
raw_text = text_dict['task_title']
if text_dict['comment_type'] == "task_description":
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
else:
raw_text = text_dict['comment_text']
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
# 1. replace code with CODE
@ -119,7 +122,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
array_of_categorizations.append(text_dict)
df = pd.DataFrame(array_of_categorizations)
#print(df.head())
df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)
df.to_csv('all_091625_olmo_batched_categorized.csv', index=False)

View File

@ -71,7 +71,11 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
text_dict['task_title'] = row[1]
text_dict['comment_text'] = row[2]
text_dict['comment_type'] = row[12]
raw_text = text_dict['task_title']
# include the task title in the text only for task descriptions; other comment types use comment_text alone
if text_dict['comment_type'] == "task_description":
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
else:
raw_text = text_dict['comment_text']
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
# 1. replace code with CODE
@ -120,7 +124,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
array_of_categorizations.append(text_dict)
df = pd.DataFrame(array_of_categorizations)
random_df = df.sample(n=300, random_state=8)
random_df.to_csv('091425_human_text_sample.csv', index=False)
random_df.to_csv('091625_human_text_sample.csv', index=False)