97 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser
logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.

    Parameters:
        mappings: Container supporting ``in`` and item lookup, keyed by the
            page id as a string. The looked-up value is passed as ``uid``
            to ``handler.write_entry``.
        handler: Object exposing an awaitable ``write_entry(entry, uid)``.
        max_concurrent: When greater than 0, at most this many pages are
            processed at once (enforced with an ``asyncio.Semaphore``);
            otherwise processing is unbounded.

    Scheduled tasks are collected in ``self.tasks``; this class never awaits
    them itself. Because ``endElement`` calls ``asyncio.create_task``, the
    SAX parse has to be driven while an event loop is running.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        # Parser state, updated as SAX events arrive.
        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []
        # Raw character chunks of the page-level <id> currently being read.
        self._pageIdChunks: list[str] = []

    def startElement(self, name, attrs):
        """Track which element is open and reset per-page buffers."""
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
            self._pageIdChunks = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True
        elif name == "id":
            # Start every <id> with a clean buffer.
            self._pageIdChunks = []

    def endElement(self, name):
        """Finalize the element that just closed; schedule work on </page>."""
        if name == "id":
            self._finishPageId()
        elif name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug("scheduled %s for handling", wd_id)
                self._schedule(text, wd_id)
            else:
                logger.debug("page %s without wikidata id, skipping...", pid)
            self._resetPage()
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        """Buffer character data for the page id or the revision text.

        A SAX parser may deliver one run of character data in several
        chunks, so both the id and the text are accumulated here and only
        joined once their element closes.
        """
        if self._readingPageId():
            self._pageIdChunks.append(content)
        elif self.inText:
            # Always append text content, even if it's just whitespace or
            # newlines: the page text must be preserved verbatim.
            self.currentText.append(content)

    def _readingPageId(self) -> bool:
        """Return True while inside the first page-level <id> element."""
        return (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision  # ignore <id> elements nested in <revision>
            and not self.currentPageId  # only the first non-empty id counts
        )

    def _finishPageId(self) -> None:
        """Join the buffered id chunks and store them as the page id."""
        if self._pageIdChunks and not self.currentPageId:
            page_id = "".join(self._pageIdChunks).strip()
            if page_id:  # whitespace-only ids are ignored
                self.currentPageId = page_id
        self._pageIdChunks = []

    def _resetPage(self) -> None:
        """Clear all per-page state after a </page>."""
        self.inPage = self.inRevision = self.inText = False
        self.currentPageId = None
        self.currentText = []
        self._pageIdChunks = []

    def _schedule(self, text: str, wd_id: str) -> None:
        """Create the processing task for one page and remember it."""
        if self.sem is not None:
            coro = self._bounded_process(text, wd_id)
        else:
            coro = self._process(text, wd_id)
        self.tasks.append(asyncio.create_task(coro))

    async def _process(self, text: str, uid: str):
        """Parse one page's text and pass the result to the handler."""
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        """Run ``_process`` while holding the concurrency semaphore."""
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)