updating some scripts
This commit is contained in:
parent
f9c12bb445
commit
f68372572f
@ -71,7 +71,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
text_dict['task_title'] = row[1]
|
text_dict['task_title'] = row[1]
|
||||||
text_dict['comment_text'] = row[2]
|
text_dict['comment_text'] = row[2]
|
||||||
text_dict['comment_type'] = row[12]
|
text_dict['comment_type'] = row[12]
|
||||||
raw_text = text_dict['comment_text']
|
raw_text = text_dict['task_title']
|
||||||
|
|
||||||
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
# comment_text preprocessing per https://arxiv.org/pdf/1902.07093
|
||||||
# 1. replace code with CODE
|
# 1. replace code with CODE
|
||||||
@ -91,6 +91,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text)
|
||||||
# 5. split into an array of sentences
|
# 5. split into an array of sentences
|
||||||
comment_sentences = nltk.sent_tokenize(comment_text)
|
comment_sentences = nltk.sent_tokenize(comment_text)
|
||||||
|
text_dict['cleaned_sentences'] = comment_sentences
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
batch_size = 2
|
batch_size = 2
|
||||||
@ -118,7 +119,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
|
|||||||
array_of_categorizations.append(text_dict)
|
array_of_categorizations.append(text_dict)
|
||||||
df = pd.DataFrame(array_of_categorizations)
|
df = pd.DataFrame(array_of_categorizations)
|
||||||
#print(df.head())
|
#print(df.head())
|
||||||
df.to_csv('090425_olmo_batched_categorized.csv', index=False)
|
df.to_csv('titles_090725_olmo_batched_categorized.csv', index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,14 +77,16 @@ if __name__ == "__main__":
|
|||||||
#loading in the discussion data from the universal CSV
|
#loading in the discussion data from the universal CSV
|
||||||
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||||
#formatting for the neurobiber model
|
#formatting for the neurobiber model
|
||||||
docs = first_discussion_df["comment_text"].astype(str).tolist()
|
#docs = first_discussion_df["comment_text"].astype(str).tolist()
|
||||||
|
task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
|
||||||
|
docs = task_description_df['task_title'].astype(str).tolist()
|
||||||
#load model and run
|
#load model and run
|
||||||
#model, tokenizer = load_model_and_tokenizer()
|
#model, tokenizer = load_model_and_tokenizer()
|
||||||
preds_df = biberplus_labeler(docs)
|
preds_df = biberplus_labeler(docs)
|
||||||
#new columns in the df for the predicted neurobiber items
|
#new columns in the df for the predicted neurobiber items
|
||||||
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
||||||
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
||||||
final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
|
final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
|
||||||
#print(type(preds))
|
#print(type(preds))
|
||||||
#assigning the predictions as a new column
|
#assigning the predictions as a new column
|
||||||
'''
|
'''
|
||||||
@ -95,18 +97,18 @@ if __name__ == "__main__":
|
|||||||
how='inner'
|
how='inner'
|
||||||
)
|
)
|
||||||
'''
|
'''
|
||||||
print(first_discussion_df)
|
#print(first_discussion_df)
|
||||||
print(final_discussion_df)
|
#print(final_discussion_df)
|
||||||
#final_discussion_df["biberplus_preds"] = list(preds)
|
#final_discussion_df["biberplus_preds"] = list(preds)
|
||||||
#assert that order has been preserved
|
#assert that order has been preserved
|
||||||
for _ in range(1000):
|
for _ in range(1000):
|
||||||
random_index = random.randrange(len(final_discussion_df))
|
random_index = random.randrange(len(final_discussion_df))
|
||||||
assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
|
assert task_description_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
|
||||||
#assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
#assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
||||||
#assert that there are the same number of rows in first_discussion_df and final_discussion_df
|
#assert that there are the same number of rows in task_description_df and final_discussion_df
|
||||||
assert len(first_discussion_df) == len(final_discussion_df)
|
assert len(task_description_df) == len(final_discussion_df)
|
||||||
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||||
# if passing the prior asserts, let's write to a csv
|
# if passing the prior asserts, let's write to a csv
|
||||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False)
|
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
|
||||||
print('biberplus labeling pau')
|
print('biberplus labeling pau')
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user