From 10fbef63b3a175babade2f88e525411f575189ba Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Tue, 13 May 2025 16:51:53 +0200 Subject: [PATCH] move code to dedicated src/ folder --- .github/workflows/test-parser.yaml | 2 +- README.md | 2 +- main.py => src/main.py | 0 .../output_handlers}/__init__.py | 0 .../output_handlers}/base_handler.py | 0 .../output_handlers}/bunny_storage.py | 0 .../output_handlers}/filesystem.py | 0 .../transformers}/__init__.py | 0 .../transformers}/fetch_mappings.py | 0 {transformers => src/transformers}/parser.py | 0 src/transformers/wiki_dump_handler.py | 96 +++++++++++++++++++ 11 files changed, 98 insertions(+), 2 deletions(-) rename main.py => src/main.py (100%) rename {output_handlers => src/output_handlers}/__init__.py (100%) rename {output_handlers => src/output_handlers}/base_handler.py (100%) rename {output_handlers => src/output_handlers}/bunny_storage.py (100%) rename {output_handlers => src/output_handlers}/filesystem.py (100%) rename {transformers => src/transformers}/__init__.py (100%) rename {transformers => src/transformers}/fetch_mappings.py (100%) rename {transformers => src/transformers}/parser.py (100%) create mode 100644 src/transformers/wiki_dump_handler.py diff --git a/.github/workflows/test-parser.yaml b/.github/workflows/test-parser.yaml index e6ee580..70a7c77 100644 --- a/.github/workflows/test-parser.yaml +++ b/.github/workflows/test-parser.yaml @@ -20,4 +20,4 @@ jobs: run: uv sync --locked --dev - name: Run tests - run: PYTHONPATH=. uv run pytest + run: PYTHONPATH=src uv run pytest diff --git a/README.md b/README.md index 69a1aaa..d0096bd 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ See [docs](docs) for more information on how to use this utility. ## Testing -Run `PYTHONPATH=. pytest` from inside the venv +Run `PYTHONPATH=src pytest` from inside the venv, or directly call `PYTHONPATH=src uv run -- pytest`. ## License diff --git a/main.py b/src/main.py similarity index 100% rename from main.py rename to src/main.py diff --git a/output_handlers/__init__.py b/src/output_handlers/__init__.py similarity index 100% rename from output_handlers/__init__.py rename to src/output_handlers/__init__.py diff --git a/output_handlers/base_handler.py b/src/output_handlers/base_handler.py similarity index 100% rename from output_handlers/base_handler.py rename to src/output_handlers/base_handler.py diff --git a/output_handlers/bunny_storage.py b/src/output_handlers/bunny_storage.py similarity index 100% rename from output_handlers/bunny_storage.py rename to src/output_handlers/bunny_storage.py diff --git a/output_handlers/filesystem.py b/src/output_handlers/filesystem.py similarity index 100% rename from output_handlers/filesystem.py rename to src/output_handlers/filesystem.py diff --git a/transformers/__init__.py b/src/transformers/__init__.py similarity index 100% rename from transformers/__init__.py rename to src/transformers/__init__.py diff --git a/transformers/fetch_mappings.py b/src/transformers/fetch_mappings.py similarity index 100% rename from transformers/fetch_mappings.py rename to src/transformers/fetch_mappings.py diff --git a/transformers/parser.py b/src/transformers/parser.py similarity index 100% rename from transformers/parser.py rename to src/transformers/parser.py diff --git a/src/transformers/wiki_dump_handler.py b/src/transformers/wiki_dump_handler.py new file mode 100644 index 0000000..eecf022 --- /dev/null +++ b/src/transformers/wiki_dump_handler.py @@ -0,0 +1,96 @@ +from logging import getLogger +import xml.sax +import asyncio +from .parser import WikivoyageParser + +logger = getLogger(__name__) + +class WikiDumpHandler(xml.sax.ContentHandler): + """ + SAX handler that, for each whose is in mappings, + collects the and schedules an async task to parse + and write via the user‐supplied handler. + """ + + def __init__(self, mappings, handler, max_concurrent): + super().__init__() + self.mappings = mappings + self.handler = handler + self.sem = ( + asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None + ) + self.tasks: list[asyncio.Task] = [] + + self.currentTag: str | None = None + self.inPage = False + self.inRevision = False + self.inText = False + self.currentPageId: str | None = None + self.currentText: list[str] = [] + + def startElement(self, name, attrs): + self.currentTag = name + if name == "page": + logger.debug("start page") + self.inPage = True + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("start revision") + self.inRevision = True + elif name == "text" and self.inRevision: + logger.debug("start text") + self.inText = True + + def endElement(self, name): + if name == "page": + logger.debug("end page") + pid = self.currentPageId + if pid and pid in self.mappings: + wd_id = self.mappings[pid] + text = "".join(self.currentText) + logger.debug(f"scheduled {wd_id} for handling") + # schedule processing + if self.sem: + task = asyncio.create_task(self._bounded_process(text, wd_id)) + else: + task = asyncio.create_task(self._process(text, wd_id)) + self.tasks.append(task) + else: + logger.debug(f"page {pid} without wikidata id, skipping...") + # reset + self.inPage = self.inRevision = self.inText = False + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("end revision") + self.inRevision = False + elif name == "text": + logger.debug("end text") + self.inText = False + self.currentTag = None + + def characters(self, content): + # Only filter whitespace for ID fields, preserve all content for text + if ( + self.currentTag == "id" + and self.inPage + and not self.inRevision + and not self.currentPageId + ): + content_stripped = content.strip() + if content_stripped: # Only process non-empty ID content + self.currentPageId = content_stripped + elif self.inText: + # Always append text content, even if it's just whitespace or newlines + self.currentText.append(content) + + async def _process(self, text: str, uid: str): + parser = WikivoyageParser() + entry = parser.parse(text) + await self.handler.write_entry(entry, uid) + + async def _bounded_process(self, text: str, uid: str): + # Only run N at once + async with self.sem: + await self._process(text, uid)