adding biberplus labels
This commit is contained in:
		
							parent
							
								
									edd17d3269
								
							
						
					
					
						commit
						b0584ec1be
					
				
							
								
								
									
										151688
									
								
								p2/quest/072325_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151688
									
								
								p2/quest/072325_biberplus_labels.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -1,3 +1,36 @@ | |||||||
| starting the job at: Tue Jul 22 16:43:27 CDT 2025 | starting the job at: Wed Jul 23 14:49:04 CDT 2025 | ||||||
| setting up the environment | setting up the environment | ||||||
| running the biberplus labeling script | running the biberplus labeling script | ||||||
|  | 26024 | ||||||
|  | 26024 | ||||||
|  |            id  ... http_flag | ||||||
|  | 0       56791  ...       NaN | ||||||
|  | 1      269631  ...       NaN | ||||||
|  | 2      269628  ...       NaN | ||||||
|  | 3      269622  ...       NaN | ||||||
|  | 4       56737  ...       NaN | ||||||
|  | ...       ...  ...       ... | ||||||
|  | 26019  403186  ...      True | ||||||
|  | 26020   78646  ...      True | ||||||
|  | 26021  429163  ...      True | ||||||
|  | 26022  429137  ...      True | ||||||
|  | 26023  418783  ...      True | ||||||
|  | 
 | ||||||
|  | [26024 rows x 22 columns] | ||||||
|  |            id  ...                                            message | ||||||
|  | 0       56791  ...  pawn character editing\n\nseen on master branc... | ||||||
|  | 1      269631  ...  Change 86685 merged by jenkins-bot:\nFollow-up... | ||||||
|  | 2      269628  ...  *** Bug 54785 has been marked as a duplicate o... | ||||||
|  | 3      269622  ...  Change 86685 had a related patch set uploaded ... | ||||||
|  | 4       56737  ...  **Author:** `Wikifram`\n\n**Description:**\nAf... | ||||||
|  | ...       ...  ...                                                ... | ||||||
|  | 26019  403186  ...  Could you attach a screenshot please? Drag & d... | ||||||
|  | 26020   78646  ...  Hi,\n\nWe have a wiki which has a part which c... | ||||||
|  | 26021  429163  ...  Sorry for not reply-ing. I did a test and coul... | ||||||
|  | 26022  429137  ...                        @DikkieDick: Please answer. | ||||||
|  | 26023  418783  ...  I cannot replicate this. What's the name of th... | ||||||
|  | 
 | ||||||
|  | [26024 rows x 121 columns] | ||||||
|  | biberplus labeling pau | ||||||
|  | job finished, cleaning up | ||||||
|  | job pau at: Wed Jul 23 14:58:09 CDT 2025 | ||||||
|  | |||||||
| @ -39,7 +39,6 @@ def biberplus_labeler(text): | |||||||
|     config = load_config() |     config = load_config() | ||||||
|     config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100}) |     config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100}) | ||||||
|     pipeline = load_pipeline(config) |     pipeline = load_pipeline(config) | ||||||
|     #test =  ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis'] |  | ||||||
|     features_list = [] |     features_list = [] | ||||||
|     for message in text: |     for message in text: | ||||||
|         message_label = calculate_tag_frequencies(message, pipeline, config) |         message_label = calculate_tag_frequencies(message, pipeline, config) | ||||||
| @ -48,18 +47,11 @@ def biberplus_labeler(text): | |||||||
|         features_list.append(mean_row) |         features_list.append(mean_row) | ||||||
|     print(len(features_list)) |     print(len(features_list)) | ||||||
|     frequencies_df = pd.DataFrame(features_list) |     frequencies_df = pd.DataFrame(features_list) | ||||||
|     frequencies_df['comment_text'] = text |     frequencies_df['message'] = text | ||||||
|     frequencies_df = frequencies_df.reset_index(drop=True) |     frequencies_df = frequencies_df.reset_index(drop=True) | ||||||
|     return frequencies_df |     return frequencies_df | ||||||
|   |   | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     #https://huggingface.co/Blablablab/neurobiber |  | ||||||
|     ''' |  | ||||||
|     docs = [ |  | ||||||
|     "First text goes here.", |  | ||||||
|     "Second text, slightly different style." |  | ||||||
|     ] |  | ||||||
|     ''' |  | ||||||
|     #loading in the discussion data from the universal CSV |     #loading in the discussion data from the universal CSV | ||||||
|     first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") |     first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv") | ||||||
|     #formatting for the neurobiber model |     #formatting for the neurobiber model | ||||||
| @ -81,15 +73,18 @@ if __name__ == "__main__": | |||||||
|         how='inner' |         how='inner' | ||||||
|     ) |     ) | ||||||
|     ''' |     ''' | ||||||
|     print(len(final_discussion_df)) |     print(first_discussion_df) | ||||||
|  |     print(final_discussion_df) | ||||||
|     #final_discussion_df["biberplus_preds"] = list(preds) |     #final_discussion_df["biberplus_preds"] = list(preds) | ||||||
|     #assert that order has been preserved  |     #assert that order has been preserved  | ||||||
|     for _ in range(10): |     for _ in range(1000): | ||||||
|         random_index = random.choice(first_discussion_df.index) |         random_index = random.randrange(len(final_discussion_df)) | ||||||
|         assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] |         assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"] | ||||||
|  |         #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"] | ||||||
|     #assert that there are the same number of rows in first_discussion_df and second_discussion_df |     #assert that there are the same number of rows in first_discussion_df and second_discussion_df | ||||||
|     assert len(first_discussion_df) == len(final_discussion_df) |     assert len(first_discussion_df) == len(final_discussion_df) | ||||||
|  |     final_discussion_df = final_discussion_df.drop(columns=["message"]) | ||||||
|     # if passing the prior asserts, let's write to a csv |     # if passing the prior asserts, let's write to a csv | ||||||
|     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False) |     final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False) | ||||||
|     print('biberplus labeling pau') |     print('biberplus labeling pau') | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user