Mirror of https://github.com/bcye/structured-wikivoyage-exports.git, synced 2025-11-01 15:42:43 +00:00
move code to dedicated src/ folder
src/main.py (new file, 132 lines)
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser


logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed it to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 9. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())
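For orientation, a minimal sketch of how gather_handler_kwargs() turns environment variables into constructor arguments. The variable values below are hypothetical examples, not defaults shipped with the project, and the import assumes the script is run from the src/ directory:

import os
from main import gather_handler_kwargs

# Hypothetical environment, e.g. loaded from a .env file by load_dotenv()
os.environ["HANDLER_FILESYSTEM_OUTPUT_DIR"] = "./output"
os.environ["HANDLER_FILESYSTEM_FAIL_ON_ERROR"] = "false"

kwargs = gather_handler_kwargs("filesystem")
# -> {"output_dir": "./output", "fail_on_error": False}
# main() would then instantiate FilesystemHandler(**kwargs).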
src/output_handlers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .base_handler import BaseHandler
from .filesystem import FilesystemHandler
from .bunny_storage import BunnyStorageHandler
src/output_handlers/base_handler.py (new file, 65 lines)
@@ -0,0 +1,65 @@
"""Reference handler for output handlers."""
from abc import ABC, abstractmethod
import logging


class BaseHandler(ABC):
    """
    Abstract base class for output handlers. Defines the standardized interface that all output handlers must implement.
    In particular, it requires the implementation of an asynchronous ("private") method `_write_entry` to write a single entry to the output.
    """

    logger = logging.getLogger(__name__)
    _successful_writes = 0
    _failed_writes = 0

    def __init__(self, fail_on_error: bool = True, **kwargs):
        """
        Initializes the BaseHandler with optional parameters.

        Args:
            fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
            **kwargs: Additional keyword arguments for specific handler implementations.
        """
        self.fail_on_error = fail_on_error

    @abstractmethod
    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the output. This method should gracefully handle any exceptions that may occur during the writing process and simply return False if an error occurs.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        pass

    async def write_entry(self, entry: dict, uid: str):
        """
        Public method to write an entry to the output. It handles exceptions and logs errors.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
        """
        success = await self._write_entry(entry, uid)
        if success:
            self.logger.debug(f"Successfully wrote entry with UID {uid}")
            self._successful_writes += 1
        else:
            self.logger.error(f"Failed to write entry with UID {uid}")
            self._failed_writes += 1
            if self.fail_on_error:
                raise Exception(f"Failed to write entry with UID {uid}")

    async def close(self):
        """
        Closes the handler. This method should be overridden by subclasses if they need to perform any cleanup operations.
        """
        self.logger.info(f"Wrote {self._successful_writes + self._failed_writes} entries: {self._successful_writes} successful, {self._failed_writes} failed.")
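To illustrate the contract, a hypothetical minimal subclass (not part of this commit) that writes each entry to stdout instead of persisting it:

import json
from output_handlers.base_handler import BaseHandler


class StdoutHandler(BaseHandler):
    """Toy handler: prints each entry rather than storing it."""

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        try:
            print(uid, json.dumps(entry))
            return True
        except Exception:
            # Per the base-class contract, swallow the error and report failure
            return False

With HANDLER=stdout, main() would look for a module output_handlers.stdout exposing this class.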
src/output_handlers/bunny_storage.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import json
import aiohttp
from .base_handler import BaseHandler


class BunnyStorageHandler(BaseHandler):
    def __init__(
        self,
        region: str,
        base_path: str,
        api_key: str,
        fail_on_error: bool = True,
        keepalive_timeout: int = 75,
    ):
        super().__init__(fail_on_error=fail_on_error)
        self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
        self.headers = {
            "AccessKey": api_key,
            "Content-Type": "application/json",
            "accept": "application/json",
        }

        # initialized later, in a guaranteed async context
        self._connector = None
        self._session = None
        self._keepalive_timeout = keepalive_timeout

    async def setup_connector(self):
        if self._session is None:
            self._connector = aiohttp.TCPConnector(
                # limit is implicitly set to 100
                keepalive_timeout=self._keepalive_timeout,
            )
            self._session = aiohttp.ClientSession(connector=self._connector)

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        await self.setup_connector()
        payload = json.dumps(entry).encode("utf-8")
        url = f"{self.base_url}/{uid}.json"

        try:
            async with self._session.put(url, data=payload, headers=self.headers) as resp:
                if resp.status in (200, 201, 204):
                    return True
                body = await resp.text()
                self.logger.error(f"Upload failed UID={uid} status={resp.status} body={body}")
                return False

        except Exception:
            self.logger.exception(f"Exception while uploading UID={uid}")
            return False

    async def close(self):
        # The session and connector only exist if at least one entry was written.
        if self._session is not None:
            await self._session.close()
            await self._connector.close()
        await super().close()
src/output_handlers/filesystem.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""Handler that writes files to the filesystem."""
from pathlib import Path
import aiofiles
from .base_handler import BaseHandler
import json


class FilesystemHandler(BaseHandler):
    """
    Handler that writes files to the filesystem.
    """
    def __init__(self, output_dir: str, **kwargs):
        """
        Initializes the FilesystemHandler with the specified output directory.

        Args:
            output_dir (str): The directory where files will be written.
            **kwargs: Additional keyword arguments for the BaseHandler.
        """
        super().__init__(**kwargs)
        self.output_dir = Path(output_dir)
        # Ensure the target directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Output directory set to {self.output_dir}")

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the filesystem.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        try:
            file_path = self.output_dir / f"{uid}.json"
            async with aiofiles.open(file_path, 'w') as f:
                await f.write(json.dumps(entry))
            return True
        except IOError as e:
            self.logger.error(f"Error writing entry {uid}: {e}")
            return False
src/transformers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser
src/transformers/fetch_mappings.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re
import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings
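As an illustration of what the tuple regex extracts, here it is applied to a made-up fragment of an INSERT statement (the page id and Q-id are invented, not taken from the real dump):

sample = "(1045,'wikibase_item','Q1781',NULL),(1045,'page_image','Foo.jpg',NULL)"
for m in tuple_re.finditer(sample):
    page_id, prop, value = m.group(1), m.group(2), m.group(3)
    if prop == "wikibase_item":
        print(page_id, "->", value)   # prints: 1045 -> Q1781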
src/transformers/parser.py (new file, 348 lines)
@@ -0,0 +1,348 @@
"""Where the magic happens: parsing wikitext into a structured JSON tree."""
import mwparserfromhell as mwp
import mwparserfromhell.nodes as nodes
import json
from typing import Dict


DOCUMENT_TEMPLATES = [
    "pagebanner", "mapframe", "routebox", "geo", "isPartOf",
    "usablecity", "guidecity", "outlinecity"
]
LISTING_TEMPLATES = [
    "see", "do", "buy", "eat", "drink", "sleep", "listing"
]


class WikivoyageParser:
    """
    A parser for Wikivoyage wikitext to JSON tree structure.
    This class uses mwparserfromhell to parse the wikitext and convert it into a structured JSON format.
    """
    def __init__(self):
        self.root = {
            "type": "root",
            "properties": {},
            "children": []
        }
        self.current_section = self.root

    def parse(self, wikitext: str) -> Dict:
        """Parse wikitext and return structured JSON tree"""
        self.root = {
            "type": "root",
            "properties": {},
            "children": []
        }
        self.current_section = self.root

        # Parse the wikitext
        parsed = mwp.parse(wikitext)

        # Process the parsed content
        self._process_nodes(parsed)

        return self.root

    def _process_nodes(self, wikicode):
        """Process all nodes in the wikicode"""
        current_text = ""

        for node in wikicode.nodes:
            # Handle different node types
            if isinstance(node, nodes.heading.Heading):
                # First flush any pending text
                if current_text:
                    self._add_text_node(current_text)
                    current_text = ""

                # Create new section
                self._handle_heading(node)

            elif isinstance(node, nodes.template.Template):
                # First flush any pending text
                if current_text:
                    self._add_text_node(current_text)
                    current_text = ""

                # Handle template
                self._handle_template(node)

            elif isinstance(node, nodes.text.Text):
                # Accumulate text
                current_text += str(node.value)

            elif isinstance(node, nodes.tag.Tag):
                # Handle tag (potential styling)
                tag_text = self._convert_tag_to_markdown(node)
                current_text += tag_text

            elif isinstance(node, nodes.wikilink.Wikilink):
                # Handle wikilink
                link_text = self._convert_wikilink_to_markdown(node)
                current_text += link_text

            elif isinstance(node, nodes.external_link.ExternalLink):
                # Handle external link
                link_text = self._convert_external_link_to_markdown(node)
                current_text += link_text

            elif isinstance(node, nodes.comment.Comment):
                # Skip comments
                pass

            else:
                # Process other nodes as text
                current_text += str(node)

        # Add any remaining text
        if current_text:
            self._add_text_node(current_text)

    def _add_text_node(self, text: str):
        """Add a text node to the current section"""
        # Avoid adding empty text nodes
        if not text.strip():
            return

        text_node = {
            "type": "text",
            "properties": {
                "markdown": text.strip()
            },
            "children": []
        }

        self.current_section["children"].append(text_node)

    def _handle_heading(self, heading_node):
        """Handle a heading node by creating a new section"""
        level = heading_node.level
        title = str(heading_node.title).strip()

        # Create new section node
        section = {
            "type": "section",
            "properties": {
                "title": title,
                "level": level
            },
            "children": []
        }

        # Find the appropriate parent section based on level
        parent = self.root

        # If the level is 1, the parent is the root
        if level > 1:
            # Walk the existing sections from the most recent one backwards
            for child in reversed(self._get_all_sections()):
                child_level = child["properties"]["level"]
                if child_level < level:
                    parent = child
                    break

        # Add the section to its parent
        parent["children"].append(section)

        # Update current section
        self.current_section = section

    def _get_all_sections(self):
        """Get all sections in the document in the order they appear"""
        sections = []

        def collect_sections(node):
            if node["type"] == "section":
                sections.append(node)
            for child in node["children"]:
                if child["type"] == "section":
                    collect_sections(child)

        collect_sections(self.root)
        return sections

    def _handle_template(self, template_node):
        """Handle a template node"""
        template_name = str(template_node.name).strip().lower()

        # Check if it's a document-wide template
        if template_name in DOCUMENT_TEMPLATES:
            self._handle_document_template(template_node)
            return

        # Check if it's a listing template
        if template_name in LISTING_TEMPLATES:
            self._handle_listing_template(template_node)
            return

        # Handle other templates as regular nodes
        self._handle_other_template(template_node)

    def _handle_document_template(self, template_node):
        """Handle document-wide templates by adding to root properties"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        params = {}
        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()
            params[name] = value

        # Add to root properties
        self.root["properties"][template_name] = params

    def _handle_listing_template(self, template_node):
        """Handle listing templates (see, do, buy, eat, drink, sleep)"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        properties = {}
        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()

            # Convert content to markdown if it's in the 'content' parameter
            if name == "content":
                value = self._convert_wikicode_to_markdown(param.value)

            properties[name] = value

        # Create listing node
        listing_node = {
            "type": template_name,
            "properties": properties,
            "children": []
        }

        # Add to current section
        self.current_section["children"].append(listing_node)

    def _handle_other_template(self, template_node):
        """Handle other templates as general template nodes"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        properties = {
            "name": template_name,
            "params": {}
        }

        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()
            properties["params"][name] = value

        # Create template node
        template_entry = {
            "type": "template",
            "properties": properties,
            "children": []
        }

        # Add to current section
        self.current_section["children"].append(template_entry)

    def _convert_wikicode_to_markdown(self, wikicode) -> str:
        """Convert wikicode to markdown"""
        markdown = ""

        for node in wikicode.nodes:
            if isinstance(node, nodes.text.Text):
                markdown += str(node.value)

            elif isinstance(node, nodes.tag.Tag):
                markdown += self._convert_tag_to_markdown(node)

            elif isinstance(node, nodes.wikilink.Wikilink):
                markdown += self._convert_wikilink_to_markdown(node)

            elif isinstance(node, nodes.external_link.ExternalLink):
                markdown += self._convert_external_link_to_markdown(node)

            else:
                # For other nodes, just use their string representation
                markdown += str(node)

        return markdown.strip()

    def _convert_tag_to_markdown(self, tag_node) -> str:
        """Convert HTML tag to markdown"""
        tag = str(tag_node.tag).lower()
        content = str(tag_node.contents)

        # Convert the content recursively to handle nested tags
        if tag_node.contents:
            content = self._convert_wikicode_to_markdown(tag_node.contents)

        # Handle different tags
        if tag == 'b' or tag == 'strong':
            return f"**{content}**"
        elif tag == 'i' or tag == 'em':
            return f"*{content}*"
        elif tag == 'u':
            return f"_{content}_"
        elif tag == 'strike' or tag == 's' or tag == 'del':
            return f"~~{content}~~"
        elif tag == 'code':
            return f"`{content}`"
        elif tag == 'pre':
            return f"```\n{content}\n```"
        elif tag == 'br':
            return "\n"
        elif tag == 'hr':
            return "\n---\n"
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag[1])
            return f"\n{'#' * level} {content}\n"
        elif tag == 'a':
            href = ""
            for attr in tag_node.attributes:
                if str(attr.name).lower() == 'href':
                    href = str(attr.value)
                    break
            return f"[{content}]({href})"
        elif tag == 'img':
            src = alt = ""
            for attr in tag_node.attributes:
                if str(attr.name).lower() == 'src':
                    src = str(attr.value)
                elif str(attr.name).lower() == 'alt':
                    alt = str(attr.value)
            return f"![{alt}]({src})"
        else:
            # For unknown tags, just return the content
            return content

    def _convert_wikilink_to_markdown(self, wikilink_node) -> str:
        """Convert wikilink to markdown"""
        title = str(wikilink_node.title)

        if wikilink_node.text:
            text = str(wikilink_node.text)
            return f"[{text}]({title})"
        else:
            return f"[{title}]({title})"

    def _convert_external_link_to_markdown(self, link_node) -> str:
        """Convert external link to markdown"""
        url = str(link_node.url)

        if link_node.title:
            title = str(link_node.title)
            return f"[{title}]({url})"
        else:
            return url

    def export_json(self, root=None, indent=2) -> str:
        """Export the tree as JSON string"""
        if root is None:
            root = self.root

        return json.dumps(root, indent=indent)
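A quick usage sketch of the parser (the wikitext snippet below is invented for illustration and assumes mwparserfromhell is installed):

parser = WikivoyageParser()
tree = parser.parse(
    "{{pagebanner|Example banner.jpg}}\n"
    "== See ==\n"
    "{{see|name=Old Town|content=The '''historic''' centre.}}\n"
)
print(parser.export_json(tree))
# The pagebanner template lands in tree["properties"], "See" becomes a
# section node, and the {{see}} listing becomes a child node of that section.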
src/transformers/wiki_dump_handler.py (new file, 96 lines)
@@ -0,0 +1,96 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
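For a rough end-to-end check of the SAX flow without downloading the full dump, a hypothetical smoke test could look like this (not part of the repository; it assumes it is run from the src/ directory and that /tmp/wikivoyage-test is writable):

import asyncio
import xml.sax
from transformers import WikiDumpHandler
from output_handlers import FilesystemHandler

SAMPLE = (
    "<mediawiki>"
    "<page><id>42</id><revision><text>== See ==\nSome text.</text></revision></page>"
    "</mediawiki>"
)

async def smoke_test():
    handler = FilesystemHandler(output_dir="/tmp/wikivoyage-test")
    # Map the fake page id 42 to a fake Wikidata id; max_concurrent=0 disables the semaphore
    dump_handler = WikiDumpHandler({"42": "Q42"}, handler, max_concurrent=0)
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(dump_handler)
    sax_parser.feed(SAMPLE)   # endElement("page") schedules the parse/write task
    sax_parser.close()
    await asyncio.gather(*dump_handler.tasks)
    await handler.close()     # logs write statistics

asyncio.run(smoke_test())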