Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							9a852b9300
							
						
					 | 
					
						
						
							
							was renamed to 'term_frequencies' prior to merge.
						
						
						
						
						
					 | 
					
						2024-12-12 07:54:28 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							3d192ab82f
							
						
					 | 
					
						
						
							
							Merge remote-tracking branch 'origin/icwsm_dataverse'
						
						
						
						
						
					 | 
					
						2024-12-12 07:45:06 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							f38ec6c129
							
						
					 | 
					
						
						
							
							smaller outchunk size.
						
						
						
						
						
					 | 
					
						2024-12-07 13:23:44 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							25bfc57baf
							
						
					 | 
					
						
						
							
							change path
						
						
						
						
						
					 | 
					
						2024-12-06 08:18:20 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							c3d2834110
							
						
					 | 
					
						
						
							
							use pyarrow instead of spark to write data
						
						
						
						
						
					 | 
					
						2024-12-06 08:09:02 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							8224195432
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-12-05 11:08:18 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							5d70d3eb6d
							
						
					 | 
					
						
						
							
							improve spark configuration.
						
						
						
						
						
					 | 
					
						2024-12-04 10:43:13 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							89d03dd956
							
						
					 | 
					
						
						
							
							consistent naming and bugfix.
						
						
						
						
						
					 | 
					
						2024-12-04 09:24:45 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							472849ebd9
							
						
					 | 
					
						
						
							
							correct output path.
						
						
						
						
						
					 | 
					
						2024-12-04 09:07:10 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							85945eae90
							
						
					 | 
					
						
						
							
							correct paths.
						
						
						
						
						
					 | 
					
						2024-12-04 09:06:02 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							1cca01fb69
							
						
					 | 
					
						
						
							
							use Path to make directories not os.
						
						
						
						
						
					 | 
					
						2024-12-04 07:47:47 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							39c0fa7a29
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-12-03 19:18:38 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							0436450ea8
							
						
					 | 
					
						
						
							
							typo fix
						
						
						
						
						
					 | 
					
						2024-12-03 19:16:49 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							4be8bb6bf5
							
						
					 | 
					
						
						
							
							bugfix
						
						
						
						
						
					 | 
					
						2024-12-03 19:15:07 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							ec5859c311
							
						
					 | 
					
						
						
							
							pass ngram_output through.
						
						
						
						
						
					 | 
					
						2024-12-03 19:05:44 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							a179d608eb
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-12-03 19:02:26 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							73dd2a96a6
							
						
					 | 
					
						
						
							
							it's selftext not body
						
						
						
						
						
					 | 
					
						2024-12-03 18:59:27 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							5045d6052e
							
						
					 | 
					
						
						
							
							use post title and body in terms
						
						
						
						
						
					 | 
					
						2024-12-03 18:53:41 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							5a131053af
							
						
					 | 
					
						
						
							
							spark config tweaks.
						
						
						
						
						
					 | 
					
						2024-12-01 15:41:47 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							224fb89317
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-12-01 15:28:25 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							b25c332cea
							
						
					 | 
					
						
						
							
							typo fix.
						
						
						
						
						
					 | 
					
						2024-12-01 15:27:16 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							613059737a
							
						
					 | 
					
						
						
							
							set os environment for big machine
						
						
						
						
						
					 | 
					
						2024-12-01 15:25:18 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							abe217d2d5
							
						
					 | 
					
						
						
							
							fix configuration code
						
						
						
						
						
					 | 
					
						2024-12-01 15:21:51 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							9911f758f9
							
						
					 | 
					
						
						
							
							set memory usage.
						
						
						
						
						
					 | 
					
						2024-12-01 14:55:38 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							a31d8b26eb
							
						
					 | 
					
						
						
							
							correct tf_name
						
						
						
						
						
					 | 
					
						2024-12-01 14:38:48 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							e40cc45d40
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-12-01 14:10:47 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							d61746c9f7
							
						
					 | 
					
						
						
							
							make the output authors path.
						
						
						
						
						
					 | 
					
						2024-12-01 13:58:13 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							9df9a8b8ff
							
						
					 | 
					
						
						
							
							rename function.
						
						
						
						
						
					 | 
					
						2024-12-01 13:44:19 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							3fea1f9388
							
						
					 | 
					
						
						
							
							sort and partition the term frequencies using spark.
						
						
						
						
						
					 | 
					
						2024-12-01 13:42:13 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							2b023fea8d
							
						
					 | 
					
						
						
							
							bugfix
						
						
						
						
						
					 | 
					
						2024-12-01 09:58:09 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							88fca0f82b
							
						
					 | 
					
						
						
							
							allow posts schemas to be nullable.
						
						
						
						
						
					 | 
					
						2024-12-01 09:55:12 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							271cbea7d9
							
						
					 | 
					
						
						
							
							add a 'limit' parameter for testing.
						
						
						
						
						
					 | 
					
						2024-12-01 09:51:49 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							4218bf864b
							
						
					 | 
					
						
						
							
							debugging.
						
						
						
						
						
					 | 
					
						2024-12-01 09:39:50 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							22d6a6961c
							
						
					 | 
					
						
						
							
							allow authors to be null in submissions.
						
						
						
						
						
					 | 
					
						2024-11-27 20:04:05 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							a5ca25dd6e
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-11-27 19:56:06 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							2e5181602b
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-11-27 19:53:04 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							0d7f4d3cec
							
						
					 | 
					
						
						
							
							pass through stopWords.
						
						
						
						
						
					 | 
					
						2024-11-27 19:33:28 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							5d48c0eb55
							
						
					 | 
					
						
						
							
							pass through mwe_tokenize
						
						
						
						
						
					 | 
					
						2024-11-27 19:31:59 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							91cc1edf02
							
						
					 | 
					
						
						
							
							pass through mwe_pass
						
						
						
						
						
					 | 
					
						2024-11-27 19:20:49 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							2decdc9750
							
						
					 | 
					
						
						
							
							move function to outer scope.
						
						
						
						
						
					 | 
					
						2024-11-27 19:13:49 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							7da046735b
							
						
					 | 
					
						
						
							
							move function to outer scope.
						
						
						
						
						
					 | 
					
						2024-11-27 19:10:34 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							0631256956
							
						
					 | 
					
						
						
							
							make the output directory.
						
						
						
						
						
					 | 
					
						2024-11-27 19:06:24 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							8cb9683bc2
							
						
					 | 
					
						
						
							
							bugfix
						
						
						
						
						
					 | 
					
						2024-11-27 19:03:52 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							587e1c0022
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-11-27 18:56:22 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							78eb16f4d6
							
						
					 | 
					
						
						
							
							more path munging.
						
						
						
						
						
					 | 
					
						2024-11-27 18:53:16 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							a0a6a08bf2
							
						
					 | 
					
						
						
							
							handle case where we're in a parent directory.
						
						
						
						
						
					 | 
					
						2024-11-27 18:49:03 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							a84b633641
							
						
					 | 
					
						
						
							
							add absolute path to call.
						
						
						
						
						
					 | 
					
						2024-11-27 18:42:29 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							ce7b5f92eb
							
						
					 | 
					
						
						
							
							bugfix.
						
						
						
						
						
					 | 
					
						2024-11-27 17:20:04 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							fbf905c740
							
						
					 | 
					
						
						
							
							rename file
						
						
						
						
						
					 | 
					
						2024-11-27 11:55:31 -08:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Nathan TeBlunthuis
							
						 
					 | 
					
						
						
						
						
							
						
						
							dd894ebf61
							
						
					 | 
					
						
						
							
							support posts in ngrams
						
						
						
						
						
					 | 
					
						2024-11-27 11:51:22 -08:00 | 
					
					
						
						
							
							
							
						
					 |