refactor into src-layout package.
This commit is contained in:
		
							parent
							
								
									56c90fe1cc
								
							
						
					
					
						commit
						c597a6b7f4
					
				
							
								
								
									
										42
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,42 @@ | |||||||
|  | When you install this from git, you will need to first clone the | ||||||
|  | repository: | ||||||
|  | 
 | ||||||
|  |     git clone git://projects.mako.cc/mediawiki_dump_tools | ||||||
|  | 
 | ||||||
|  | From within the repository working directory, initialize and set up the | ||||||
|  | submodule like: | ||||||
|  | 
 | ||||||
|  |     git submodule init | ||||||
|  |     git submodule update | ||||||
|  | 
 | ||||||
|  | Wikimedia dumps are usually in a compressed format such as 7z (most | ||||||
|  | common), gz, or bz2. Wikiq uses your computer's compression software to | ||||||
|  | read these files. Therefore wikiq depends on `7za`, | ||||||
|  | `gzcat`, and `zcat`. | ||||||
|  | 
 | ||||||
|  | # Dependencies | ||||||
|  | 
 | ||||||
|  | These non-Python dependencies must be installed on your system for wikiq | ||||||
|  | and its associated tests to work. | ||||||
|  | 
 | ||||||
|  | -   7zip | ||||||
|  | -   ffmpeg | ||||||
|  | 
 | ||||||
|  | A new diff engine based on `wikidiff2` can be used for | ||||||
|  | word-persistence. Wikiq can also output the diffs between each page | ||||||
|  | revision. This requires installing Wikidiff 2 on your system. On Debian | ||||||
|  | or Ubuntu Linux this can be done via: | ||||||
|  | 
 | ||||||
|  | `apt-get install php-wikidiff2` | ||||||
|  | 
 | ||||||
|  | You may also have to run `sudo phpenmod wikidiff2`. | ||||||
|  | 
 | ||||||
|  | ## Tests | ||||||
|  | 
 | ||||||
|  | To run tests: | ||||||
|  | 
 | ||||||
|  |     python -m unittest test.Wikiq_Unit_Test | ||||||
|  | 
 | ||||||
|  | ## TODO: | ||||||
|  | 
 | ||||||
|  | 1.  \[\] Output metadata about the run. What parameters were used? What | ||||||
|  |     versions of deltas? | ||||||
|  | 2.  \[\] URL encoding by default | ||||||
							
								
								
									
										12
									
								
								php.ini
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								php.ini
									
									
									
									
									
								
							| @ -1,12 +0,0 @@ | |||||||
| ; wikidiff2 extension settings |  | ||||||
| wikidiff2.change_threshold = 0.2 |  | ||||||
| wikidiff2.moved_line_threshold = 0.4 |  | ||||||
| wikidiff2.moved_paragraph_detection_cutoff = 5000000 |  | ||||||
| wikidiff2.max_word_level_diff_complexity = 40000000 |  | ||||||
| wikidiff2.max_split_size = 1 |  | ||||||
| wikidiff2.initial_split_threshold = 0.1 |  | ||||||
| wikidiff2.final_split_threshold = 0.6 |  | ||||||
| 
 |  | ||||||
| ; It is possible this limit will need to be larger for some pages. |  | ||||||
| post_max_size = 10000M |  | ||||||
| opcache.enable=0 |  | ||||||
| @ -17,7 +17,18 @@ dependencies = [ | |||||||
|     "yamlconf>=0.2.6", |     "yamlconf>=0.2.6", | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [project.scripts] | ||||||
|  | wikiq = "wikiq:main" | ||||||
|  | 
 | ||||||
|  | [build-system] | ||||||
|  | requires = ["hatchling"] | ||||||
|  | build-backend = "hatchling.build" | ||||||
|  | 
 | ||||||
|  | [tool.hatch.build.targets.wheel] | ||||||
|  | packages = ["src/wikiq"] | ||||||
|  | 
 | ||||||
| [tool.uv.sources] | [tool.uv.sources] | ||||||
|  | 
 | ||||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||||
| deltas = { git = "https://github.com/groceryheist/deltas" } | deltas = { git = "https://github.com/groceryheist/deltas" } | ||||||
|  | |||||||
| @ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split | |||||||
| import mwpersistence | import mwpersistence | ||||||
| import mwreverts | import mwreverts | ||||||
| 
 | 
 | ||||||
| import tables | import wikiq.tables as tables | ||||||
| from tables import RevisionTable | from wikiq.tables import RevisionTable | ||||||
| from wiki_diff_matcher import WikiDiffMatcher | from wikiq.wiki_diff_matcher import WikiDiffMatcher | ||||||
| 
 | 
 | ||||||
| TO_ENCODE = ('title', 'editor') | TO_ENCODE = ('title', 'editor') | ||||||
| PERSISTENCE_RADIUS = 7 | PERSISTENCE_RADIUS = 7 | ||||||
| @ -352,7 +352,7 @@ class WikiqParser: | |||||||
|         schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) |         schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) | ||||||
| 
 | 
 | ||||||
|         if self.diff: |         if self.diff: | ||||||
|             from diff_pyarrow_schema import diff_field |             from wikiq.diff_pyarrow_schema import diff_field | ||||||
|             schema = schema.append(diff_field) |             schema = schema.append(diff_field) | ||||||
| 
 | 
 | ||||||
|         # Add regex fields to the schema. |         # Add regex fields to the schema. | ||||||
| @ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal | |||||||
| 
 | 
 | ||||||
| # Make references to files and wikiq relative to this file, not to the current working directory. | # Make references to files and wikiq relative to this file, not to the current working directory. | ||||||
| TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | ||||||
| WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") | WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py") | ||||||
| TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") | TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") | ||||||
| BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") | BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -6,7 +6,7 @@ import pytest_asyncio | |||||||
| from typing import List | from typing import List | ||||||
| from deltas import Delete, Equal, Insert, wikitext_split | from deltas import Delete, Equal, Insert, wikitext_split | ||||||
| from mwpersistence import Token | from mwpersistence import Token | ||||||
| from wiki_diff_matcher import WikiDiffMatcher | from wikiq.wiki_diff_matcher import WikiDiffMatcher | ||||||
| 
 | 
 | ||||||
| def _replace_whitespace(match): | def _replace_whitespace(match): | ||||||
|     if match.group(1):  # If spaces matched (e.g., '  ') |     if match.group(1):  # If spaces matched (e.g., '  ') | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user