Add noargs test for ikwiki
This way we can ensure that the parquet code produces output equivalent to the existing output. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
13
wikiq
13
wikiq
@@ -14,7 +14,7 @@ from io import TextIOWrapper
|
||||
from subprocess import Popen, PIPE
|
||||
from collections import deque
|
||||
from hashlib import sha1
|
||||
from typing import Any, IO, TextIO, Iterable
|
||||
from typing import Any, IO, TextIO
|
||||
|
||||
from mwxml import Dump
|
||||
|
||||
@@ -387,7 +387,7 @@ class WikiqParser:
|
||||
namespaces: list[int] | None = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
parquet_buffer_size: int=2000):
|
||||
parquet_buffer_size: int = 2000):
|
||||
"""
|
||||
Parameters:
|
||||
persist : what persistence method to use. Takes a PersistMethod value
|
||||
@@ -427,8 +427,8 @@ class WikiqParser:
|
||||
# make_dataclass is a function that defines a new dataclass type.
|
||||
# here we extend the type we have already chosen and add the regular expression types
|
||||
self.revdata_type: type = dc.make_dataclass('RevData_Parser',
|
||||
fields=regex_fields,
|
||||
bases=(revdata_type,))
|
||||
fields=regex_fields,
|
||||
bases=(revdata_type,))
|
||||
|
||||
# we also need to make sure that we have the right pyarrow schema
|
||||
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
|
||||
@@ -544,11 +544,12 @@ class WikiqParser:
|
||||
# Iterate through a page's revisions
|
||||
for rev in page:
|
||||
|
||||
# create a new data object instead of a dictionary.
|
||||
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
||||
# create a new data object instead of a dictionary.
|
||||
rev_data = self.revdata_type(revid=rev.id,
|
||||
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
|
||||
articleid=page.id,
|
||||
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
|
||||
editorid=editorid,
|
||||
title=page.title,
|
||||
deleted=rev.deleted.text,
|
||||
namespace=namespace
|
||||
|
||||
Reference in New Issue
Block a user