refactor into src-layout package.
This commit is contained in:
		
							parent
							
								
									56c90fe1cc
								
							
						
					
					
						commit
						c597a6b7f4
					
				
							
								
								
									
										42
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,42 @@ | ||||
| When you install this from git, you will need to first clone the | ||||
| repository: | ||||
| 
 | ||||
|     git clone git://projects.mako.cc/mediawiki_dump_tools | ||||
| 
 | ||||
| From within the repository working directory, initialize and set up the | ||||
| submodule like: | ||||
| 
 | ||||
|     git submodule init | ||||
|     git submodule update | ||||
| 
 | ||||
| Wikimedia dumps are usually in a compressed format such as 7z (most | ||||
| common), gz, or bz2. Wikiq uses your computer's compression software to | ||||
| read these files. Therefore wikiq depends on `7za`, | ||||
| `gzcat`, and `zcat`. | ||||
| 
 | ||||
| # Dependencies | ||||
| 
 | ||||
| These non-Python dependencies must be installed on your system for wikiq | ||||
| and its associated tests to work. | ||||
| 
 | ||||
| -   7zip | ||||
| -   ffmpeg | ||||
| 
 | ||||
| A new diff engine based on `wikidiff2` can be used for | ||||
| word-persistence. Wikiq can also output the diffs between each page | ||||
| revision. This requires installing Wikidiff 2 on your system. On Debian | ||||
| or Ubuntu Linux this can be done via: | ||||
| 
 | ||||
| `apt-get install php-wikidiff2` | ||||
| 
 | ||||
| You may also have to run `sudo phpenmod wikidiff2`. | ||||
| 
 | ||||
| **Tests** — to run tests: | ||||
| 
 | ||||
|     python -m unittest test.Wikiq_Unit_Test | ||||
| 
 | ||||
| ## TODO: | ||||
| 
 | ||||
| 1.  \[\] Output metadata about the run. What parameters were used? What | ||||
|     versions of deltas? | ||||
| 2.  \[\] Url encoding by default | ||||
							
								
								
									
										12
									
								
								php.ini
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								php.ini
									
									
									
									
									
								
							| @ -1,12 +0,0 @@ | ||||
| ; wikidiff2 extension settings | ||||
| wikidiff2.change_threshold = 0.2 | ||||
| wikidiff2.moved_line_threshold = 0.4 | ||||
| wikidiff2.moved_paragraph_detection_cutoff = 5000000 | ||||
| wikidiff2.max_word_level_diff_complexity = 40000000 | ||||
| wikidiff2.max_split_size = 1 | ||||
| wikidiff2.initial_split_threshold = 0.1 | ||||
| wikidiff2.final_split_threshold = 0.6 | ||||
| 
 | ||||
| ; It is possible this limit will need to be larger for some pages. | ||||
| post_max_size = 10000M | ||||
| opcache.enable=0 | ||||
| @ -17,7 +17,18 @@ dependencies = [ | ||||
|     "yamlconf>=0.2.6", | ||||
| ] | ||||
| 
 | ||||
| [project.scripts] | ||||
| wikiq = "wikiq:main" | ||||
| 
 | ||||
| [build-system] | ||||
| requires = ["hatchling"] | ||||
| build-backend = "hatchling.build" | ||||
| 
 | ||||
| [tool.hatch.build.targets.wheel] | ||||
| packages = ["src/wikiq"] | ||||
| 
 | ||||
| [tool.uv.sources] | ||||
| 
 | ||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||
| deltas = { git = "https://github.com/groceryheist/deltas" } | ||||
|  | ||||
| @ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split | ||||
| import mwpersistence | ||||
| import mwreverts | ||||
| 
 | ||||
| import tables | ||||
| from tables import RevisionTable | ||||
| from wiki_diff_matcher import WikiDiffMatcher | ||||
| import wikiq.tables as tables | ||||
| from wikiq.tables import RevisionTable | ||||
| from wikiq.wiki_diff_matcher import WikiDiffMatcher | ||||
| 
 | ||||
| TO_ENCODE = ('title', 'editor') | ||||
| PERSISTENCE_RADIUS = 7 | ||||
| @ -352,7 +352,7 @@ class WikiqParser: | ||||
|         schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) | ||||
| 
 | ||||
|         if self.diff: | ||||
|             from diff_pyarrow_schema import diff_field | ||||
|             from wikiq.diff_pyarrow_schema import diff_field | ||||
|             schema = schema.append(diff_field) | ||||
| 
 | ||||
|         # Add regex fields to the schema. | ||||
| @ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal | ||||
| 
 | ||||
| # Make references to files and wikiq relative to this file, not to the current working directory. | ||||
| TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | ||||
| WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") | ||||
| WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py") | ||||
| TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") | ||||
| BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") | ||||
| 
 | ||||
|  | ||||
| @ -6,7 +6,7 @@ import pytest_asyncio | ||||
| from typing import List | ||||
| from deltas import Delete, Equal, Insert, wikitext_split | ||||
| from mwpersistence import Token | ||||
| from wiki_diff_matcher import WikiDiffMatcher | ||||
| from wikiq.wiki_diff_matcher import WikiDiffMatcher | ||||
| 
 | ||||
| def _replace_whitespace(match): | ||||
|     if match.group(1):  # If spaces matched (e.g., '  ') | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user