Update the labeling sample to be correct: include the task title in task-description text
This commit is contained in:
parent
d83022f184
commit
89969daab5
File diff suppressed because it is too large
Load Diff
@ -71,7 +71,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
||||
text_dict['task_title'] = row[1]
|
||||
text_dict['comment_text'] = row[2]
|
||||
text_dict['comment_type'] = row[12]
|
||||
raw_text = text_dict['task_title']
|
||||
if text_dict['comment_type'] == "task_description":
|
||||
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
|
||||
else:
|
||||
raw_text = text_dict['comment_text']
|
||||
|
||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||
# 1. replace code with CODE
|
||||
@ -119,7 +122,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
||||
array_of_categorizations.append(text_dict)
|
||||
df = pd.DataFrame(array_of_categorizations)
|
||||
#print(df.head())
|
||||
df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)
|
||||
df.to_csv('all_091625_olmo_batched_categorized.csv', index=False)
|
||||
|
||||
|
||||
|
||||
|
@ -71,7 +71,11 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
||||
text_dict['task_title'] = row[1]
|
||||
text_dict['comment_text'] = row[2]
|
||||
text_dict['comment_type'] = row[12]
|
||||
raw_text = text_dict['task_title']
|
||||
# make sure the task title is included in the text for task descriptions
|
||||
if text_dict['comment_type'] == "task_description":
|
||||
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
|
||||
else:
|
||||
raw_text = text_dict['comment_text']
|
||||
|
||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||
# 1. replace code with CODE
|
||||
@ -120,7 +124,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
||||
array_of_categorizations.append(text_dict)
|
||||
df = pd.DataFrame(array_of_categorizations)
|
||||
random_df = df.sample(n=300, random_state=8)
|
||||
random_df.to_csv('091425_human_text_sample.csv', index=False)
|
||||
random_df.to_csv('091625_human_text_sample.csv', index=False)
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user