refactor into src-layout package.
This commit is contained in:
parent
56c90fe1cc
commit
c597a6b7f4
42
README.md
Normal file
42
README.md
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
When you install this from git, you will need to first clone the
|
||||||
|
repository:
|
||||||
|
|
||||||
|
git clone git://projects.mako.cc/mediawiki_dump_tools
|
||||||
|
|
||||||
|
From within the repository working directory, initialize and set up the
|
||||||
|
submodule like:
|
||||||
|
|
||||||
|
git submodule init
|
||||||
|
git submodule update
|
||||||
|
|
||||||
|
Wikimedia dumps are usually in a compressed format such as 7z (most
|
||||||
|
common), gz, or bz2. Wikiq uses your computer's compression software to
|
||||||
|
read these files. Therefore wikiq depends on `7za`,
|
||||||
|
`gzcat`, and `zcat`.
|
||||||
|
|
||||||
|
# Dependencies
|
||||||
|
|
||||||
|
These non-Python dependencies must be installed on your system for wikiq
|
||||||
|
and its associated tests to work.
|
||||||
|
|
||||||
|
- 7zip
|
||||||
|
- ffmpeg
|
||||||
|
|
||||||
|
A new diff engine based on `wikidiff2` can be used for
|
||||||
|
word-persistence. Wikiq can also output the diffs between each page
|
||||||
|
revision. This requires installing Wikidiff 2 on your system. On Debian
|
||||||
|
or Ubuntu Linux this can be done via:
|
||||||
|
|
||||||
|
`apt-get install php-wikidiff2`
|
||||||
|
|
||||||
|
You may also have to run `sudo phpenmod wikidiff2`.
|
||||||
|
|
||||||
|
## Tests

To run tests:
|
||||||
|
|
||||||
|
python -m unittest test.Wikiq_Unit_Test
|
||||||
|
|
||||||
|
## TODO:
|
||||||
|
|
||||||
|
1. \[\] Output metadata about the run. What parameters were used? What
|
||||||
|
versions of deltas?
|
||||||
|
2. \[\] Url encoding by default
|
12
php.ini
12
php.ini
@ -1,12 +0,0 @@
|
|||||||
; wikidiff2 extension settings
|
|
||||||
wikidiff2.change_threshold = 0.2
|
|
||||||
wikidiff2.moved_line_threshold = 0.4
|
|
||||||
wikidiff2.moved_paragraph_detection_cutoff = 5000000
|
|
||||||
wikidiff2.max_word_level_diff_complexity = 40000000
|
|
||||||
wikidiff2.max_split_size = 1
|
|
||||||
wikidiff2.initial_split_threshold = 0.1
|
|
||||||
wikidiff2.final_split_threshold = 0.6
|
|
||||||
|
|
||||||
; It is possible this limit will need to be larger for some pages.
|
|
||||||
post_max_size = 10000M
|
|
||||||
opcache.enable=0
|
|
@ -17,7 +17,18 @@ dependencies = [
|
|||||||
"yamlconf>=0.2.6",
|
"yamlconf>=0.2.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
wikiq = "wikiq:main"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["src/wikiq"]
|
||||||
|
|
||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
|
|
||||||
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
|
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
|
||||||
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
|
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
|
||||||
deltas = { git = "https://github.com/groceryheist/deltas" }
|
deltas = { git = "https://github.com/groceryheist/deltas" }
|
||||||
|
@ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split
|
|||||||
import mwpersistence
|
import mwpersistence
|
||||||
import mwreverts
|
import mwreverts
|
||||||
|
|
||||||
import tables
|
import wikiq.tables as tables
|
||||||
from tables import RevisionTable
|
from wikiq.tables import RevisionTable
|
||||||
from wiki_diff_matcher import WikiDiffMatcher
|
from wikiq.wiki_diff_matcher import WikiDiffMatcher
|
||||||
|
|
||||||
TO_ENCODE = ('title', 'editor')
|
TO_ENCODE = ('title', 'editor')
|
||||||
PERSISTENCE_RADIUS = 7
|
PERSISTENCE_RADIUS = 7
|
||||||
@ -352,7 +352,7 @@ class WikiqParser:
|
|||||||
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
||||||
|
|
||||||
if self.diff:
|
if self.diff:
|
||||||
from diff_pyarrow_schema import diff_field
|
from wikiq.diff_pyarrow_schema import diff_field
|
||||||
schema = schema.append(diff_field)
|
schema = schema.append(diff_field)
|
||||||
|
|
||||||
# Add regex fields to the schema.
|
# Add regex fields to the schema.
|
@ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal
|
|||||||
|
|
||||||
# Make references to files and wikiq relative to this file, not to the current working directory.
|
# Make references to files and wikiq relative to this file, not to the current working directory.
|
||||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||||
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
|
WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py")
|
||||||
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
||||||
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import pytest_asyncio
|
|||||||
from typing import List
|
from typing import List
|
||||||
from deltas import Delete, Equal, Insert, wikitext_split
|
from deltas import Delete, Equal, Insert, wikitext_split
|
||||||
from mwpersistence import Token
|
from mwpersistence import Token
|
||||||
from wiki_diff_matcher import WikiDiffMatcher
|
from wikiq.wiki_diff_matcher import WikiDiffMatcher
|
||||||
|
|
||||||
def _replace_whitespace(match):
|
def _replace_whitespace(match):
|
||||||
if match.group(1): # If spaces matched (e.g., ' ')
|
if match.group(1): # If spaces matched (e.g., ' ')
|
||||||
|
Loading…
Reference in New Issue
Block a user