diff --git a/.github/workflows/test-parser.yaml b/.github/workflows/test-parser.yaml index e6ee580..70a7c77 100644 --- a/.github/workflows/test-parser.yaml +++ b/.github/workflows/test-parser.yaml @@ -20,4 +20,4 @@ jobs: run: uv sync --locked --dev - name: Run tests - run: PYTHONPATH=. uv run pytest + run: PYTHONPATH=src uv run pytest diff --git a/README.md b/README.md index 69a1aaa..d0096bd 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ See [docs](docs) for more information on how to use this utility. ## Testing -Run `PYTHONPATH=. pytest` from inside the venv +Run `PYTHONPATH=src pytest` from inside the venv, or directly call `PYTHONPATH=src uv run -- pytest`. ## License diff --git a/main.py b/src/main.py similarity index 100% rename from main.py rename to src/main.py diff --git a/output_handlers/__init__.py b/src/output_handlers/__init__.py similarity index 100% rename from output_handlers/__init__.py rename to src/output_handlers/__init__.py diff --git a/output_handlers/base_handler.py b/src/output_handlers/base_handler.py similarity index 100% rename from output_handlers/base_handler.py rename to src/output_handlers/base_handler.py diff --git a/output_handlers/bunny_storage.py b/src/output_handlers/bunny_storage.py similarity index 100% rename from output_handlers/bunny_storage.py rename to src/output_handlers/bunny_storage.py diff --git a/output_handlers/filesystem.py b/src/output_handlers/filesystem.py similarity index 100% rename from output_handlers/filesystem.py rename to src/output_handlers/filesystem.py diff --git a/transformers/__init__.py b/src/transformers/__init__.py similarity index 100% rename from transformers/__init__.py rename to src/transformers/__init__.py diff --git a/transformers/fetch_mappings.py b/src/transformers/fetch_mappings.py similarity index 100% rename from transformers/fetch_mappings.py rename to src/transformers/fetch_mappings.py diff --git a/transformers/parser.py b/src/transformers/parser.py similarity index 100% rename from transformers/parser.py rename to src/transformers/parser.py diff --git a/src/transformers/wiki_dump_handler.py b/src/transformers/wiki_dump_handler.py new file mode 100644 index 0000000..eecf022 --- /dev/null +++ b/src/transformers/wiki_dump_handler.py @@ -0,0 +1,96 @@ +from logging import getLogger +import xml.sax +import asyncio +from .parser import WikivoyageParser + +logger = getLogger(__name__) + +class WikiDumpHandler(xml.sax.ContentHandler): + """ + SAX handler that, for each <page> whose <id> is in mappings, + collects the <text> and schedules an async task to parse + and write via the userāsupplied handler. + """ + + def __init__(self, mappings, handler, max_concurrent): + super().__init__() + self.mappings = mappings + self.handler = handler + self.sem = ( + asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None + ) + self.tasks: list[asyncio.Task] = [] + + self.currentTag: str | None = None + self.inPage = False + self.inRevision = False + self.inText = False + self.currentPageId: str | None = None + self.currentText: list[str] = [] + + def startElement(self, name, attrs): + self.currentTag = name + if name == "page": + logger.debug("start page") + self.inPage = True + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("start revision") + self.inRevision = True + elif name == "text" and self.inRevision: + logger.debug("start text") + self.inText = True + + def endElement(self, name): + if name == "page": + logger.debug("end page") + pid = self.currentPageId + if pid and pid in self.mappings: + wd_id = self.mappings[pid] + text = "".join(self.currentText) + logger.debug(f"scheduled {wd_id} for handling") + # schedule processing + if self.sem: + task = asyncio.create_task(self._bounded_process(text, wd_id)) + else: + task = asyncio.create_task(self._process(text, wd_id)) + self.tasks.append(task) + else: + logger.debug(f"page {pid} without wikidata id, skipping...") + # reset + self.inPage = self.inRevision = self.inText = False + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("end revision") + self.inRevision = False + elif name == "text": + logger.debug("end text") + self.inText = False + self.currentTag = None + + def characters(self, content): + # Only filter whitespace for ID fields, preserve all content for text + if ( + self.currentTag == "id" + and self.inPage + and not self.inRevision + and not self.currentPageId + ): + content_stripped = content.strip() + if content_stripped: # Only process non-empty ID content + self.currentPageId = content_stripped + elif self.inText: + # Always append text content, even if it's just whitespace or newlines + self.currentText.append(content) + + async def _process(self, text: str, uid: str): + parser = WikivoyageParser() + entry = parser.parse(text) + await self.handler.write_entry(entry, uid) + + async def _bounded_process(self, text: str, uid: str): + # Only run N at once + async with self.sem: + await self._process(text, uid)