adding biberplus labels
This commit is contained in:
parent
edd17d3269
commit
b0584ec1be
151688
p2/quest/072325_biberplus_labels.csv
Normal file
151688
p2/quest/072325_biberplus_labels.csv
Normal file
File diff suppressed because one or more lines are too long
@ -1,3 +1,36 @@
|
|||||||
starting the job at: Tue Jul 22 16:43:27 CDT 2025
|
starting the job at: Wed Jul 23 14:49:04 CDT 2025
|
||||||
setting up the environment
|
setting up the environment
|
||||||
running the biberplus labeling script
|
running the biberplus labeling script
|
||||||
|
26024
|
||||||
|
26024
|
||||||
|
id ... http_flag
|
||||||
|
0 56791 ... NaN
|
||||||
|
1 269631 ... NaN
|
||||||
|
2 269628 ... NaN
|
||||||
|
3 269622 ... NaN
|
||||||
|
4 56737 ... NaN
|
||||||
|
... ... ... ...
|
||||||
|
26019 403186 ... True
|
||||||
|
26020 78646 ... True
|
||||||
|
26021 429163 ... True
|
||||||
|
26022 429137 ... True
|
||||||
|
26023 418783 ... True
|
||||||
|
|
||||||
|
[26024 rows x 22 columns]
|
||||||
|
id ... message
|
||||||
|
0 56791 ... pawn character editing\n\nseen on master branc...
|
||||||
|
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
|
||||||
|
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
|
||||||
|
3 269622 ... Change 86685 had a related patch set uploaded ...
|
||||||
|
4 56737 ... **Author:** `Wikifram`\n\n**Description:**\nAf...
|
||||||
|
... ... ... ...
|
||||||
|
26019 403186 ... Could you attach a screenshot please? Drag & d...
|
||||||
|
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
|
||||||
|
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
|
||||||
|
26022 429137 ... @DikkieDick: Please answer.
|
||||||
|
26023 418783 ... I cannot replicate this. What's the name of th...
|
||||||
|
|
||||||
|
[26024 rows x 121 columns]
|
||||||
|
biberplus labeling pau
|
||||||
|
job finished, cleaning up
|
||||||
|
job pau at: Wed Jul 23 14:58:09 CDT 2025
|
||||||
|
@ -39,7 +39,6 @@ def biberplus_labeler(text):
|
|||||||
config = load_config()
|
config = load_config()
|
||||||
config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
|
config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
|
||||||
pipeline = load_pipeline(config)
|
pipeline = load_pipeline(config)
|
||||||
#test = ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
|
|
||||||
features_list = []
|
features_list = []
|
||||||
for message in text:
|
for message in text:
|
||||||
message_label = calculate_tag_frequencies(message, pipeline, config)
|
message_label = calculate_tag_frequencies(message, pipeline, config)
|
||||||
@ -48,18 +47,11 @@ def biberplus_labeler(text):
|
|||||||
features_list.append(mean_row)
|
features_list.append(mean_row)
|
||||||
print(len(features_list))
|
print(len(features_list))
|
||||||
frequencies_df = pd.DataFrame(features_list)
|
frequencies_df = pd.DataFrame(features_list)
|
||||||
frequencies_df['comment_text'] = text
|
frequencies_df['message'] = text
|
||||||
frequencies_df = frequencies_df.reset_index(drop=True)
|
frequencies_df = frequencies_df.reset_index(drop=True)
|
||||||
return frequencies_df
|
return frequencies_df
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
#https://huggingface.co/Blablablab/neurobiber
|
|
||||||
'''
|
|
||||||
docs = [
|
|
||||||
"First text goes here.",
|
|
||||||
"Second text, slightly different style."
|
|
||||||
]
|
|
||||||
'''
|
|
||||||
#loading in the discussion data from the universal CSV
|
#loading in the discussion data from the universal CSV
|
||||||
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||||
#formatting for the neurobiber model
|
#formatting for the neurobiber model
|
||||||
@ -81,15 +73,18 @@ if __name__ == "__main__":
|
|||||||
how='inner'
|
how='inner'
|
||||||
)
|
)
|
||||||
'''
|
'''
|
||||||
print(len(final_discussion_df))
|
print(first_discussion_df)
|
||||||
|
print(final_discussion_df)
|
||||||
#final_discussion_df["biberplus_preds"] = list(preds)
|
#final_discussion_df["biberplus_preds"] = list(preds)
|
||||||
#assert that order has been preserved
|
#assert that order has been preserved
|
||||||
for _ in range(10):
|
for _ in range(1000):
|
||||||
random_index = random.choice(first_discussion_df.index)
|
random_index = random.randrange(len(final_discussion_df))
|
||||||
assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
|
||||||
|
#assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
||||||
#assert that there are the same number of rows in first_discussion_df and second_discussion_df
|
#assert that there are the same number of rows in first_discussion_df and second_discussion_df
|
||||||
assert len(first_discussion_df) == len(final_discussion_df)
|
assert len(first_discussion_df) == len(final_discussion_df)
|
||||||
|
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||||
# if passing the prior asserts, let's write to a csv
|
# if passing the prior asserts, let's write to a csv
|
||||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
|
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
|
||||||
print('biberplus labeling pau')
|
print('biberplus labeling pau')
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user