updating with fit topic model
This commit is contained in:
		
							parent
							
								
									609039e5cc
								
							
						
					
					
						commit
						7141d0d9ad
					
				
							
								
								
									
										
											BIN
										
									
								
								models/062525bertopic
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								models/062525bertopic
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										4
									
								
								models/bertopic_job.log
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								models/bertopic_job.log
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,4 @@ | ||||
| setting up the environment by loading in conda environment at Wed Jun 25 12:06:59 CDT 2025 | ||||
| running the bertopic job at Wed Jun 25 12:06:59 CDT 2025 | ||||
| 2025-06-25 12:09:02,952 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same. | ||||
| 350 | ||||
| @ -8,10 +8,14 @@ | ||||
| #SBATCH --mem=64G | ||||
| #SBATCH --cpus-per-task=4 | ||||
| #SBATCH --job-name=SLR_OCR  | ||||
| #SBATCH --output=slr_ocr_job.log | ||||
| #SBATCH --output=bertopic_job.log | ||||
| #SBATCH --mail-type=BEGIN,END,FAIL | ||||
| #SBATCH --mail-user=gaughan@u.northwestern.edu | ||||
| 
 | ||||
| module purge | ||||
| 
 | ||||
| eval "$(conda shell.bash hook)" | ||||
| 
 | ||||
| echo "setting up the environment by loading in conda environment at $(date)" | ||||
| 
 | ||||
| conda activate bertopic-env | ||||
|  | ||||
| @ -1,9 +1,10 @@ | ||||
| from bertopic import BERTopic | ||||
| 
 | ||||
| from nltk.corpus import stopwords | ||||
| import os | ||||
| import re | ||||
| from markdown import markdown | ||||
| from bs4 import BeautifulSoup | ||||
| import string  | ||||
| 
 | ||||
| #function generated by GitHub Copilot | ||||
| def strip_markdown(md_text): | ||||
| @ -73,11 +74,29 @@ def get_all_md_sections(directory): | ||||
|                 all_sections.extend(clean_sections) | ||||
|     return all_sections | ||||
| 
 | ||||
| #function generated by GitHub Copilot | ||||
def clean_text(text, stop_words):
    """Return *text* with ASCII punctuation and stopwords removed.

    Punctuation (every character in ``string.punctuation``) is deleted
    outright, so contractions collapse ("don't" -> "dont"). The remaining
    text is split on whitespace and every word whose lowercase form is in
    *stop_words*, or is one of the corpus-specific terms "software" and
    "project", is dropped. Surviving words keep their original casing and
    are re-joined with single spaces.

    Args:
        text: The document text to clean.
        stop_words: Any container supporting ``in`` (set, frozenset, list,
            ...) holding lowercase stopwords. It is NOT modified; earlier
            revisions added "software" and "project" to the caller's set on
            every call.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Corpus-specific terms filtered in addition to the caller's stopwords.
    # Kept local so the caller's collection is never mutated.
    extra_stop_words = ("software", "project")

    # Remove punctuation in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords (case-insensitive match, original casing preserved).
    kept_words = []
    for word in text.split():
        lowered = word.lower()
        if lowered in stop_words or lowered in extra_stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)
| 
 | ||||
| 
 | ||||
if __name__ == "__main__":
    # Pipeline: collect markdown sections -> strip punctuation/stopwords ->
    # fit a BERTopic model -> pickle the fitted model to disk.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)

    # Cleaning (largely just removing stopwords and punctuation).
    # The NLTK stopword corpus must already be installed; if it is missing,
    # run nltk.download('stopwords') once in this environment.
    stop_words = set(stopwords.words('english'))
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))

    # Hand off to the topic model. Fit exactly once, on the cleaned
    # documents; the stale extra fit on the raw docs is gone.
    topic_model = BERTopic()
    topics, probabilities = topic_model.fit_transform(cleaned_docs)

    # NOTE(review): both return values below are discarded, so these calls
    # have no visible effect in a batch job -- print or save them if the
    # tables are wanted. get_document_info is given the raw docs while the
    # model was fit on cleaned_docs; confirm that pairing is intended.
    topic_model.get_topic_info()
    topic_model.get_document_info(docs)

    # Save to a concrete file path; the stale save that targeted the bare
    # models/ directory is gone. BERTopic warns that a pickled model must be
    # loaded in the same environment it was saved in.
    topic_model.save("/home/nws8519/git/adaptation-slr/models/062525bertopic", serialization="pickle")
|  | ||||
							
								
								
									
										33385
									
								
								models/slr_ocr_job.log
									
									
									
									
									
								
							
							
						
						
									
										33385
									
								
								models/slr_ocr_job.log
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Loading…
	
		Reference in New Issue
	
	Block a user