ae870fed0b 
							
						 
					 
					
						
						
							
							parquet path is code-complete  
						
						
						
					 
					
						2021-10-17 21:46:31 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							26f6d8f984 
							
						 
					 
					
						
						
							
							remove dependency on pandas.  
						
						
						
					 
					
						2021-10-17 20:24:33 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							ae9a241747 
							
						 
					 
					
						
						
							
							use dataclasses and pyarrow for types.  
						
						
						
					 
					
						2021-10-17 20:21:22 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							d8d20f670b 
							
						 
					 
					
						
						
							
							initial work on parquet support  
						
						
						
					 
					
						2021-10-17 13:22:22 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							cdfa77d66d 
							
						 
					 
					
						
						
							
							remove commented code  
						
						
						
					 
					
						2019-11-11 11:28:48 -08:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							02b3250a36 
							
						 
					 
					
						
						
							
							refactor regex matching in a tidier object oriented style  
						
						
						
					 
					
						2019-11-09 13:07:46 -08:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							414cc5ff2d 
							
						 
					 
					
						
						
							
							validate tests and add asserts and baselines for regex tests.  
						
						
						
					 
					
						2019-11-09 12:19:55 -08:00 
						 
				 
			
				
					
						
							
							
								sohyeonhwang 
							
						 
					 
					
						
						
						
						
							
						
						
							4ccde84529 
							
						 
					 
					
						
						
							
							added regex scanner v2's dump unit test file regextest.xml.bz2  
						
						
						
					 
					
						2019-11-07 14:06:15 -06:00 
						 
				 
			
				
					
						
							
							
								sohyeonhwang 
							
						 
					 
					
						
						
						
						
							
						
						
							f147e1d899 
							
						 
					 
					
						
						
							
							merging pull containing revert-radius with 2nd version of regex scanner w/ unit tests  
						
						
						
					 
					
						2019-11-07 13:28:17 -06:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							c84844cfb5 
							
						 
					 
					
						
						
							
							add unit tests for configuring revert_radius  
						
						
						
					 
					
						2019-10-07 15:02:30 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							c4416d0f1b 
							
						 
					 
					
						
						
							
							make revert radius configurable  
						
						
						
					 
					
						2019-10-07 13:57:49 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							7b856bec86 
							
						 
					 
					
						
						
							
							Merge branch 'master' into regex_scanner  
						
						
						
					 
					
						2019-10-05 18:17:03 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							324ccc8e26 
							
						 
					 
					
						
						
							
							update baseline outputs  
						
						
						
					 
					
						2019-10-05 16:36:07 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							17529cdd48 
							
						 
					 
					
						
						
							
							bugfix, remove old legacy persistence flag  
						
						
						
					 
					
						2019-10-05 16:13:11 -07:00 
						 
				 
			
				
					
						
							
							
								sohyeonhwang 
							
						 
					 
					
						
						
						
						
							
						
						
							7bf4559ceb 
							
						 
					 
					
						
						
							
							changes for regex scanner addition  
						
						
						
					 
					
						2019-10-05 15:36:58 -05:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							fb052ffa33 
							
						 
					 
					
						
						
							
							don't compute persistence by default  
						
						
						
					 
					
						2019-09-22 15:54:17 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							e871023ff5 
							
						 
					 
					
						
						
							
							elaborate docstring for persistence  
						
						
						
					 
					
						2019-09-22 15:11:59 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							7d62ff9fb7 
							
						 
					 
					
						
						
							
							improve help for namespace-include  
						
						
						
					 
					
						2018-09-03 11:30:12 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							f7f5bf8fd4 
							
						 
					 
					
						
						
							
							sub assertEquals assertEqual  
						
						
						
					 
					
						2018-09-03 11:21:49 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							f784c77f60 
							
						 
					 
					
						
						
							
							add namespace filter parameter  
						
						
						
					 
					
						2018-09-03 11:13:48 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							317bafb50d 
							
						 
					 
					
						
						
							
							Merge branch 'advanced_persistence' of code.communitydata.cc:mediawiki_dump_tools into advanced_persistence  
						
						
						
					 
					
						2018-08-23 19:00:49 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							7cd0bf3b9e 
							
						 
					 
					
						
						
							
							Add parameter for selecting specific namespaces.  
						
						
						
					 
					
						2018-08-23 18:49:32 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							d93769c21f 
							
						 
					 
					
						
						
							
							Merge branch 'advanced_persistence' of code.communitydata.cc:mediawiki_dump_tools into advanced_persistence  
						
						
						
					 
					
						2018-08-23 18:27:09 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							afd40c1a45 
							
						 
					 
					
						
						
							
							Merge branch 'advanced_persistence' of code.communitydata.cc:/mediawiki_dump_tools into advanced_persistence  
						
						
						
					 
					
						2018-08-23 18:25:51 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							e4222c45dd 
							
						 
					 
					
						
						
							
							add namespace filter parameter  
						
						
						
					 
					
						2018-08-23 18:25:08 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							829ffcffae 
							
						 
					 
					
						
						
							
							Merge branch 'advanced_persistence' of code.communitydata.cc:/mediawiki_dump_tools into advanced_persistence  
						
						
						
					 
					
						2018-08-23 18:23:36 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							776b73519a 
							
						 
					 
					
						
						
							
							add namespace filter parameter  
						
						
						
					 
					
						2018-08-23 18:23:23 -07:00 
						 
				 
			
				
					
						
							
							
								Nate E TeBlunthuis 
							
						 
					 
					
						
						
						
						
							
						
						
							5b6aaad862 
							
						 
					 
					
						
						
							
							add namespace filter parameter  
						
						
						
					 
					
						2018-08-23 18:02:56 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							f468d1a5b6 
							
						 
					 
					
						
						
							
							add support for persistence with segment matching  
						
						
						
					 
					
						2018-08-20 16:08:16 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							bf396ad366 
							
						 
					 
					
						
						
							
							Prefix page titles with namespace names.  
						
						
						
					 
					
						2018-07-09 22:11:17 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							dba793c6ac 
							
						 
					 
					
						
						
							
							migrate to mwxml. This completes the migration away from python-mediawiki-utilities. Except for preserving legacy persistence behavior, we can safely use the nice updates from the mediawiki-utils project.  
						
						
						
					 
					
						2018-07-05 01:16:00 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							d77b0a4965 
							
						 
					 
					
						
						
							
							migrate to mwpersistence. this fixes many issues. We preserve legacy persistence behavior using the --persistence-legacy.  
						
						
						
					 
					
						2018-07-04 19:06:07 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							7db6288923 
							
						 
					 
					
						
						
							
							migrate reverts to python-mwreverts  
						
						
						
					 
					
						2018-07-04 15:29:48 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							a883cb536b 
							
						 
					 
					
						
						
							
							add note to readme about dependency on compression software  
						
						
						
					 
					
						2018-07-04 15:20:52 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							e925ac9da1 
							
						 
					 
					
						
						
							
							add tests for wikipedia, malformed xml, bzip2, correct bz2 bug in wikiq.  
						
						
						
					 
					
						2018-07-04 15:08:30 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							d2746879d0 
							
						 
					 
					
						
						
							
							create baseline tests for xml dump processing  
						
						
						
					 
					
						2018-07-03 23:43:47 -07:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							ba886ecf4c 
							
						 
					 
					
						
						
							
							a number of small updates and fixes  
						
						... 
						
						
						
						- fix regex for filename/filetype matches
- unload all files, not just ones that end with xml, in 7z archives
- fix bug that broke stdout
- minor cosmetic fixes
- updated mediawiki-utilities submodule to latest version 
						
					 
					
						2018-05-17 14:37:20 -07:00 
						 
				 
			
				
					
						
					 
					
						
						
						
						
							
						
						
							3f9da40747 
							
						 
					 
					
						
						
							
							support 7z archives with multiple files. add urlencode parameter  
						
						
						
					 
					
						2017-12-07 15:10:56 -08:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							5d7dceb9e4 
							
						 
					 
					
						
						
							
							fix code to work with bzip files  
						
						
						
					 
					
						2017-02-06 18:25:17 -08:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							7d8ec932dd 
							
						 
					 
					
						
						
							
							added list of compressed dump files to .gitignore  
						
						
						
					 
					
						2015-07-23 12:16:31 -07:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							d934700ee9 
							
						 
					 
					
						
						
							
							added support to parse namespaces from title  
						
						... 
						
						
						
						This is necessary for wikis (e.g., Wikia XML dumps) that do not include
namespace metadata as tags within each <page>. 
						
					 
					
						2015-07-23 12:12:20 -07:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							108c8442b2 
							
						 
					 
					
						
						
							
							added README file to document the submodule  
						
						
						
					 
					
						2015-07-22 19:55:08 -07:00 
						 
				 
			
				
					
						
							
							
								Benjamin Mako Hill 
							
						 
					 
					
						
						
						
						
							
						
						
							eeb0742cc6 
							
						 
					 
					
						
						
							
							created new repository for wikiq with Mediawiki-Utilities as a submodule  
						
						
						
					 
					
						2015-07-22 19:44:52 -07:00