1
0

adding biberplus labels

This commit is contained in:
mgaughan 2025-07-23 15:20:26 -05:00
parent edd17d3269
commit b0584ec1be
3 changed files with 151731 additions and 15 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,3 +1,36 @@
starting the job at: Tue Jul 22 16:43:27 CDT 2025
starting the job at: Wed Jul 23 14:49:04 CDT 2025
setting up the environment
running the biberplus labeling script
26024
26024
id ... http_flag
0 56791 ... NaN
1 269631 ... NaN
2 269628 ... NaN
3 269622 ... NaN
4 56737 ... NaN
... ... ... ...
26019 403186 ... True
26020 78646 ... True
26021 429163 ... True
26022 429137 ... True
26023 418783 ... True
[26024 rows x 22 columns]
id ... message
0 56791 ... pawn character editing\n\nseen on master branc...
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
3 269622 ... Change 86685 had a related patch set uploaded ...
4 56737 ... **Author:** `Wikifram`\n\n**Description:**\nAf...
... ... ... ...
26019 403186 ... Could you attach a screenshot please? Drag & d...
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
26022 429137 ... @DikkieDick: Please answer.
26023 418783 ... I cannot replicate this. What's the name of th...
[26024 rows x 121 columns]
biberplus labeling pau
job finished, cleaning up
job pau at: Wed Jul 23 14:58:09 CDT 2025

View File

@@ -39,7 +39,6 @@ def biberplus_labeler(text):
config = load_config()
config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
pipeline = load_pipeline(config)
#test = ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
features_list = []
for message in text:
message_label = calculate_tag_frequencies(message, pipeline, config)
@@ -48,18 +47,11 @@ def biberplus_labeler(text):
features_list.append(mean_row)
print(len(features_list))
frequencies_df = pd.DataFrame(features_list)
frequencies_df['comment_text'] = text
frequencies_df['message'] = text
frequencies_df = frequencies_df.reset_index(drop=True)
return frequencies_df
if __name__ == "__main__":
#https://huggingface.co/Blablablab/neurobiber
'''
docs = [
"First text goes here.",
"Second text, slightly different style."
]
'''
#loading in the discussion data from the universal CSV
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
#formatting for the neurobiber model
@@ -81,15 +73,18 @@ if __name__ == "__main__":
how='inner'
)
'''
print(len(final_discussion_df))
print(first_discussion_df)
print(final_discussion_df)
#final_discussion_df["biberplus_preds"] = list(preds)
#assert that order has been preserved
for _ in range(10):
random_index = random.choice(first_discussion_df.index)
assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
for _ in range(1000):
random_index = random.randrange(len(final_discussion_df))
assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
#assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
#assert that there are the same number of rows in first_discussion_df and second_discussion_df
assert len(first_discussion_df) == len(final_discussion_df)
final_discussion_df = final_discussion_df.drop(columns=["message"])
# if passing the prior asserts, let's write to a csv
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
print('biberplus labeling pau')