refactor into src-layout package.
This commit is contained in:
parent
56c90fe1cc
commit
c597a6b7f4
42
README.md
Normal file
42
README.md
Normal file
@ -0,0 +1,42 @@
|
||||
When you install this from git, you will need to first clone the
|
||||
repository:
|
||||
|
||||
git clone git://projects.mako.cc/mediawiki_dump_tools
|
||||
|
||||
From within the repository working directory, initialize and set up the
|
||||
submodule like:
|
||||
|
||||
git submodule init
|
||||
git submodule update
|
||||
|
||||
Wikimedia dumps are usually in a compressed format such as 7z (most
|
||||
common), gz, or bz2. Wikiq uses your computer's compression software to
|
||||
read these files. Therefore wikiq depends on `7za`,
|
||||
`gzcat`, and `zcat`.
|
||||
|
||||
# Dependencies
|
||||
|
||||
These non-Python dependencies must be installed on your system for wikiq
|
||||
and its associated tests to work.
|
||||
|
||||
- 7zip
|
||||
- ffmpeg
|
||||
|
||||
A new diff engine based on `wikidiff2` can be used for
|
||||
word-persistence. Wikiq can also output the diffs between each page
|
||||
revision. This requires installing Wikidiff 2 on your system. On Debian
|
||||
or Ubuntu Linux this can be done via:
|
||||
|
||||
`apt-get install php-wikidiff2`
|
||||
|
||||
You may also have to run `sudo phpenmod wikidiff2`.
|
||||
|
||||
# Tests

To run tests:
|
||||
|
||||
python -m unittest test.Wikiq_Unit_Test
|
||||
|
||||
## TODO:
|
||||
|
||||
1. \[\] Output metadata about the run. What parameters were used? What
|
||||
versions of deltas?
|
||||
2. \[\] Url encoding by default
|
12
php.ini
12
php.ini
@ -1,12 +0,0 @@
|
||||
; wikidiff2 extension settings
|
||||
wikidiff2.change_threshold = 0.2
|
||||
wikidiff2.moved_line_threshold = 0.4
|
||||
wikidiff2.moved_paragraph_detection_cutoff = 5000000
|
||||
wikidiff2.max_word_level_diff_complexity = 40000000
|
||||
wikidiff2.max_split_size = 1
|
||||
wikidiff2.initial_split_threshold = 0.1
|
||||
wikidiff2.final_split_threshold = 0.6
|
||||
|
||||
; It is possible this limit will need to be larger for some pages.
|
||||
post_max_size = 10000M
|
||||
opcache.enable=0
|
@ -17,7 +17,18 @@ dependencies = [
|
||||
"yamlconf>=0.2.6",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
wikiq = "wikiq:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/wikiq"]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
||||
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
|
||||
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
|
||||
deltas = { git = "https://github.com/groceryheist/deltas" }
|
||||
|
@ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split
|
||||
import mwpersistence
|
||||
import mwreverts
|
||||
|
||||
import tables
|
||||
from tables import RevisionTable
|
||||
from wiki_diff_matcher import WikiDiffMatcher
|
||||
import wikiq.tables as tables
|
||||
from wikiq.tables import RevisionTable
|
||||
from wikiq.wiki_diff_matcher import WikiDiffMatcher
|
||||
|
||||
TO_ENCODE = ('title', 'editor')
|
||||
PERSISTENCE_RADIUS = 7
|
||||
@ -352,7 +352,7 @@ class WikiqParser:
|
||||
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
||||
|
||||
if self.diff:
|
||||
from diff_pyarrow_schema import diff_field
|
||||
from wikiq.diff_pyarrow_schema import diff_field
|
||||
schema = schema.append(diff_field)
|
||||
|
||||
# Add regex fields to the schema.
|
@ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
# Make references to files and wikiq relative to this file, not to the current working directory.
|
||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
|
||||
WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py")
|
||||
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
||||
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
||||
|
||||
|
@ -6,7 +6,7 @@ import pytest_asyncio
|
||||
from typing import List
|
||||
from deltas import Delete, Equal, Insert, wikitext_split
|
||||
from mwpersistence import Token
|
||||
from wiki_diff_matcher import WikiDiffMatcher
|
||||
from wikiq.wiki_diff_matcher import WikiDiffMatcher
|
||||
|
||||
def _replace_whitespace(match):
|
||||
if match.group(1): # If spaces matched (e.g., ' ')
|
||||
|
Loading…
Reference in New Issue
Block a user