Update labeling sample to be correct
This commit is contained in:
parent
d83022f184
commit
89969daab5
File diff suppressed because it is too large
Load Diff
@ -71,7 +71,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
text_dict['task_title'] = row[1]
|
text_dict['task_title'] = row[1]
|
||||||
text_dict['comment_text'] = row[2]
|
text_dict['comment_text'] = row[2]
|
||||||
text_dict['comment_type'] = row[12]
|
text_dict['comment_type'] = row[12]
|
||||||
raw_text = text_dict['task_title']
|
if text_dict['comment_type'] == "task_description":
|
||||||
|
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
|
||||||
|
else:
|
||||||
|
raw_text = text_dict['comment_text']
|
||||||
|
|
||||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||||
# 1. replace code with CODE
|
# 1. replace code with CODE
|
||||||
@ -119,7 +122,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
array_of_categorizations.append(text_dict)
|
array_of_categorizations.append(text_dict)
|
||||||
df = pd.DataFrame(array_of_categorizations)
|
df = pd.DataFrame(array_of_categorizations)
|
||||||
#print(df.head())
|
#print(df.head())
|
||||||
df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)
|
df.to_csv('all_091625_olmo_batched_categorized.csv', index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -71,7 +71,11 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
text_dict['task_title'] = row[1]
|
text_dict['task_title'] = row[1]
|
||||||
text_dict['comment_text'] = row[2]
|
text_dict['comment_text'] = row[2]
|
||||||
text_dict['comment_type'] = row[12]
|
text_dict['comment_type'] = row[12]
|
||||||
raw_text = text_dict['task_title']
|
# make sure the task title is prepended to the text of task descriptions
|
||||||
|
if text_dict['comment_type'] == "task_description":
|
||||||
|
raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
|
||||||
|
else:
|
||||||
|
raw_text = text_dict['comment_text']
|
||||||
|
|
||||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||||
# 1. replace code with CODE
|
# 1. replace code with CODE
|
||||||
@ -120,7 +124,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
array_of_categorizations.append(text_dict)
|
array_of_categorizations.append(text_dict)
|
||||||
df = pd.DataFrame(array_of_categorizations)
|
df = pd.DataFrame(array_of_categorizations)
|
||||||
random_df = df.sample(n=300, random_state=8)
|
random_df = df.sample(n=300, random_state=8)
|
||||||
random_df.to_csv('091425_human_text_sample.csv', index=False)
|
random_df.to_csv('091625_human_text_sample.csv', index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user