adding biberplus labels
This commit is contained in:
		
							parent
							
								
									edd17d3269
								
							
						
					
					
						commit
						b0584ec1be
					
				
							
								
								
									
										151688
									
								
								p2/quest/072325_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151688
									
								
								p2/quest/072325_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -1,3 +1,36 @@ | ||||
| starting the job at: Tue Jul 22 16:43:27 CDT 2025 | ||||
| starting the job at: Wed Jul 23 14:49:04 CDT 2025 | ||||
| setting up the environment | ||||
| running the biberplus labeling script | ||||
| 26024 | ||||
| 26024 | ||||
|            id  ... http_flag | ||||
| 0       56791  ...       NaN | ||||
| 1      269631  ...       NaN | ||||
| 2      269628  ...       NaN | ||||
| 3      269622  ...       NaN | ||||
| 4       56737  ...       NaN | ||||
| ...       ...  ...       ... | ||||
| 26019  403186  ...      True | ||||
| 26020   78646  ...      True | ||||
| 26021  429163  ...      True | ||||
| 26022  429137  ...      True | ||||
| 26023  418783  ...      True | ||||
| 
 | ||||
| [26024 rows x 22 columns] | ||||
|            id  ...                                            message | ||||
| 0       56791  ...  pawn character editing\n\nseen on master branc... | ||||
| 1      269631  ...  Change 86685 merged by jenkins-bot:\nFollow-up... | ||||
| 2      269628  ...  *** Bug 54785 has been marked as a duplicate o... | ||||
| 3      269622  ...  Change 86685 had a related patch set uploaded ... | ||||
| 4       56737  ...  **Author:** `Wikifram`\n\n**Description:**\nAf... | ||||
| ...       ...  ...                                                ... | ||||
| 26019  403186  ...  Could you attach a screenshot please? Drag & d... | ||||
| 26020   78646  ...  Hi,\n\nWe have a wiki which has a part which c... | ||||
| 26021  429163  ...  Sorry for not reply-ing. I did a test and coul... | ||||
| 26022  429137  ...                        @DikkieDick: Please answer. | ||||
| 26023  418783  ...  I cannot replicate this. What's the name of th... | ||||
| 
 | ||||
| [26024 rows x 121 columns] | ||||
| biberplus labeling pau | ||||
| job finished, cleaning up | ||||
| job pau at: Wed Jul 23 14:58:09 CDT 2025 | ||||
|  | ||||
| @ -39,7 +39,6 @@ def biberplus_labeler(text): | ||||
|     config = load_config() | ||||
|     config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100}) | ||||
|     pipeline = load_pipeline(config) | ||||
|     #test =  ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis'] | ||||
|     features_list = [] | ||||
|     for message in text: | ||||
|         message_label = calculate_tag_frequencies(message, pipeline, config) | ||||
| @ -48,18 +47,11 @@ def biberplus_labeler(text): | ||||
|         features_list.append(mean_row) | ||||
|     print(len(features_list)) | ||||
|     frequencies_df = pd.DataFrame(features_list) | ||||
|     frequencies_df['comment_text'] = text | ||||
|     frequencies_df['message'] = text | ||||
|     frequencies_df = frequencies_df.reset_index(drop=True) | ||||
|     return frequencies_df | ||||
|   | ||||
| if __name__ == "__main__": | ||||
|     #https://huggingface.co/Blablablab/neurobiber | ||||
|     ''' | ||||
|     docs = [ | ||||
|     "First text goes here.", | ||||
|     "Second text, slightly different style." | ||||
|     ] | ||||
|     ''' | ||||
|     #loading in the discussion data from the universal CSV | ||||
|     first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") | ||||
|     #formatting for the neurobiber model | ||||
| @ -81,15 +73,18 @@ if __name__ == "__main__": | ||||
|         how='inner' | ||||
|     ) | ||||
|     ''' | ||||
|     print(len(final_discussion_df)) | ||||
|     print(first_discussion_df) | ||||
|     print(final_discussion_df) | ||||
|     #final_discussion_df["biberplus_preds"] = list(preds) | ||||
|     #assert that order has been preserved  | ||||
|     for _ in range(10): | ||||
|         random_index = random.choice(first_discussion_df.index) | ||||
|         assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] | ||||
|     for _ in range(1000): | ||||
|         random_index = random.randrange(len(final_discussion_df)) | ||||
|         assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"] | ||||
|         #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] | ||||
|     #assert that there are the same number of rows in first_discussion_df and final_discussion_df | ||||
|     assert len(first_discussion_df) == len(final_discussion_df) | ||||
|     final_discussion_df = final_discussion_df.drop(columns=["message"]) | ||||
|     # if passing the prior asserts, let's write to a csv | ||||
|     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False) | ||||
|     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False) | ||||
|     print('biberplus labeling pau') | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user