Scaffold out TODOs for making the OLMo categorization sentence-level
This commit is contained in:
parent
53775c51db
commit
2d396ceb26
@ -19,7 +19,7 @@ tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B")
|
|||||||
|
|
||||||
#TODO: text_preprocessing per https://arxiv.org/pdf/1902.07093
|
#TODO: text_preprocessing per https://arxiv.org/pdf/1902.07093
|
||||||
|
|
||||||
priming = "For the **GIVEN COMMENT**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN COMMENT** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
|
priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
|
||||||
#the typology descriptions are taken straight from https://arxiv.org/pdf/1902.07093
|
#the typology descriptions are taken straight from https://arxiv.org/pdf/1902.07093
|
||||||
typology = """
|
typology = """
|
||||||
TYPOLOGY:
|
TYPOLOGY:
|
||||||
@ -58,7 +58,7 @@ TYPOLOGY:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
#instructions="Only respond with the GIVEN COMMENT's [[CATEGORY]] classification. Do not provide any more information."
|
#instructions="Only respond with the GIVEN COMMENT's [[CATEGORY]] classification. Do not provide any more information."
|
||||||
instructions="The comment's category is: "
|
instructions="The sentence's category is: "
|
||||||
|
|
||||||
with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", mode='r', newline='') as file:
|
with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", mode='r', newline='') as file:
|
||||||
reader = csv.reader(file)
|
reader = csv.reader(file)
|
||||||
@ -93,8 +93,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
|||||||
comment_text = re.sub(url_pattern, 'URL', comment_text)
|
comment_text = re.sub(url_pattern, 'URL', comment_text)
|
||||||
# 4. if possible, replace @ with SCREEN_NAME
|
# 4. if possible, replace @ with SCREEN_NAME
|
||||||
comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
||||||
|
# TODO: 5. split into an array of sentences
|
||||||
#print(comment_text)
|
#print(comment_text)
|
||||||
|
|
||||||
|
#TODO: repeat the steps below (prompt construction, generation, category extraction) for each sentence in the comment
|
||||||
text_dict['cleaned_comment_text'] = comment_text
|
text_dict['cleaned_comment_text'] = comment_text
|
||||||
#build out prompt construction; more specificity in data provided
|
#build out prompt construction; more specificity in data provided
|
||||||
given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['cleaned_comment_text']}**'\n"
|
given_data = f"**GIVEN COMMENT: \n ' Type -{text_dict['comment_type']} \n Text -{text_dict['cleaned_comment_text']}**'\n"
|
||||||
@ -106,7 +108,6 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
|||||||
#deterministic sampling and getting the response back
|
#deterministic sampling and getting the response back
|
||||||
response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
|
response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
|
||||||
response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
|
response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
|
||||||
|
|
||||||
#print("this is the response:::: ----------------------------")
|
#print("this is the response:::: ----------------------------")
|
||||||
#print(response_txt)
|
#print(response_txt)
|
||||||
#getting the resulting codes
|
#getting the resulting codes
|
||||||
@ -118,6 +119,9 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
|||||||
else:
|
else:
|
||||||
following_text = "NO CATEGORY"
|
following_text = "NO CATEGORY"
|
||||||
#print(following_text)
|
#print(following_text)
|
||||||
|
|
||||||
|
# TODO: collate olmo categories back together into an ordered list
|
||||||
|
# TODO: add the list of sentence-level olmo categories into dictionary
|
||||||
text_dict['olmo_category'] = following_text
|
text_dict['olmo_category'] = following_text
|
||||||
'''
|
'''
|
||||||
for item in result.strip(";").split(";"):
|
for item in result.strip(";").split(";"):
|
||||||
@ -130,6 +134,6 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_lab
|
|||||||
array_of_categorizations.append(text_dict)
|
array_of_categorizations.append(text_dict)
|
||||||
#CSV everything
|
#CSV everything
|
||||||
df = pd.DataFrame(array_of_categorizations)
|
df = pd.DataFrame(array_of_categorizations)
|
||||||
df.to_csv('072525_olmo_messages_categorized.csv', index=False)
|
#df.to_csv('072525_olmo_messages_categorized.csv', index=False)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user