adding biberplus labels
This commit is contained in:
parent
edd17d3269
commit
b0584ec1be
151688
p2/quest/072325_biberplus_labels.csv
Normal file
151688
p2/quest/072325_biberplus_labels.csv
Normal file
File diff suppressed because one or more lines are too long
@ -1,3 +1,36 @@
|
||||
starting the job at: Tue Jul 22 16:43:27 CDT 2025
|
||||
starting the job at: Wed Jul 23 14:49:04 CDT 2025
|
||||
setting up the environment
|
||||
running the biberplus labeling script
|
||||
26024
|
||||
26024
|
||||
id ... http_flag
|
||||
0 56791 ... NaN
|
||||
1 269631 ... NaN
|
||||
2 269628 ... NaN
|
||||
3 269622 ... NaN
|
||||
4 56737 ... NaN
|
||||
... ... ... ...
|
||||
26019 403186 ... True
|
||||
26020 78646 ... True
|
||||
26021 429163 ... True
|
||||
26022 429137 ... True
|
||||
26023 418783 ... True
|
||||
|
||||
[26024 rows x 22 columns]
|
||||
id ... message
|
||||
0 56791 ... pawn character editing\n\nseen on master branc...
|
||||
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
|
||||
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
|
||||
3 269622 ... Change 86685 had a related patch set uploaded ...
|
||||
4 56737 ... **Author:** `Wikifram`\n\n**Description:**\nAf...
|
||||
... ... ... ...
|
||||
26019 403186 ... Could you attach a screenshot please? Drag & d...
|
||||
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
|
||||
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
|
||||
26022 429137 ... @DikkieDick: Please answer.
|
||||
26023 418783 ... I cannot replicate this. What's the name of th...
|
||||
|
||||
[26024 rows x 121 columns]
|
||||
biberplus labeling pau
|
||||
job finished, cleaning up
|
||||
job pau at: Wed Jul 23 14:58:09 CDT 2025
|
||||
|
@ -39,7 +39,6 @@ def biberplus_labeler(text):
|
||||
config = load_config()
|
||||
config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
|
||||
pipeline = load_pipeline(config)
|
||||
#test = ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
|
||||
features_list = []
|
||||
for message in text:
|
||||
message_label = calculate_tag_frequencies(message, pipeline, config)
|
||||
@ -48,18 +47,11 @@ def biberplus_labeler(text):
|
||||
features_list.append(mean_row)
|
||||
print(len(features_list))
|
||||
frequencies_df = pd.DataFrame(features_list)
|
||||
frequencies_df['comment_text'] = text
|
||||
frequencies_df['message'] = text
|
||||
frequencies_df = frequencies_df.reset_index(drop=True)
|
||||
return frequencies_df
|
||||
|
||||
if __name__ == "__main__":
|
||||
#https://huggingface.co/Blablablab/neurobiber
|
||||
'''
|
||||
docs = [
|
||||
"First text goes here.",
|
||||
"Second text, slightly different style."
|
||||
]
|
||||
'''
|
||||
#loading in the discussion data from the universal CSV
|
||||
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||
#formatting for the neurobiber model
|
||||
@ -81,15 +73,18 @@ if __name__ == "__main__":
|
||||
how='inner'
|
||||
)
|
||||
'''
|
||||
print(len(final_discussion_df))
|
||||
print(first_discussion_df)
|
||||
print(final_discussion_df)
|
||||
#final_discussion_df["biberplus_preds"] = list(preds)
|
||||
#assert that order has been preserved
|
||||
for _ in range(10):
|
||||
random_index = random.choice(first_discussion_df.index)
|
||||
assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
||||
for _ in range(1000):
|
||||
random_index = random.randrange(len(final_discussion_df))
|
||||
assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
|
||||
#assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
||||
#assert that there are the same number of rows in first_discussion_df and second_discussion_df
|
||||
assert len(first_discussion_df) == len(final_discussion_df)
|
||||
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||
# if passing the prior asserts, let's write to a csv
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
|
||||
print('biberplus labeling pau')
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user