mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-06-08 00:44:04 +00:00
move code to dedicated src/ folder
This commit is contained in:
parent
38901474c6
commit
10fbef63b3
2
.github/workflows/test-parser.yaml
vendored
2
.github/workflows/test-parser.yaml
vendored
@ -20,4 +20,4 @@ jobs:
|
|||||||
run: uv sync --locked --dev
|
run: uv sync --locked --dev
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: PYTHONPATH=. uv run pytest
|
run: PYTHONPATH=src uv run pytest
|
||||||
|
@ -18,7 +18,7 @@ See [docs](docs) for more information on how to use this utility.
|
|||||||
|
|
||||||
## Testing
|
## Testing
|
||||||
|
|
||||||
Run `PYTHONPATH=. pytest` from inside the venv
|
Run `PYTHONPATH=src pytest` from inside the venv, or directly call `PYTHONPATH=src uv run -- pytest`.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
|
96
src/transformers/wiki_dump_handler.py
Normal file
96
src/transformers/wiki_dump_handler.py
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
from logging import getLogger
|
||||||
|
import xml.sax
|
||||||
|
import asyncio
|
||||||
|
from .parser import WikivoyageParser
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that schedules async processing for mapped wiki pages.

    For each <page> whose page-level <id> is a key of ``mappings``, the
    character data of the <text> element inside its <revision> is collected.
    When the page ends, an asyncio task is created that parses that text
    with ``WikivoyageParser`` and awaits ``handler.write_entry(entry, uid)``.

    Created tasks are appended to ``self.tasks``; this class never awaits
    them itself. Because ``asyncio.create_task`` is used, the SAX callbacks
    must run while an event loop is running.
    """

    def __init__(self, mappings, handler, max_concurrent):
        """
        Set up the handler state.

        Args:
            mappings: Lookup supporting ``in`` and ``[]``, keyed by the page
                id string as read from the dump. The looked-up value is
                passed to ``handler.write_entry`` as ``uid``.
            handler: Object exposing an awaitable ``write_entry(entry, uid)``.
            max_concurrent: When greater than 0, at most this many scheduled
                jobs run inside the semaphore at once; otherwise no
                semaphore is used.
        """
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        # Parsing state for the element / page currently being read.
        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []
        # Raw chunks of the page-level <id>; joined when </id> is seen.
        self._page_id_parts: list[str] = []

    def startElement(self, name, attrs):
        """Record the opened tag and update the page/revision/text flags."""
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
            self._page_id_parts = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        """
        Handle a closing tag.

        Closing <id> finalises the buffered page id. Closing <page>
        schedules processing when the page id is present in ``mappings``
        and then resets all per-page state.
        """
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing, bounded by the semaphore if one exists
                if self.sem:
                    job = self._bounded_process(text, wd_id)
                else:
                    job = self._process(text, wd_id)
                self.tasks.append(asyncio.create_task(job))
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
            self._page_id_parts = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        elif name == "id" and self._page_id_parts:
            # The parser may have delivered the id in several chunks; only
            # now is the complete value known.
            page_id = "".join(self._page_id_parts).strip()
            self._page_id_parts = []
            if page_id and not self.currentPageId:
                self.currentPageId = page_id
        self.currentTag = None

    def characters(self, content):
        """
        Collect character data for the page id or the revision text.

        Page-level <id> chunks are buffered and stripped once the element
        closes. <text> chunks are appended verbatim, including chunks that
        are only whitespace or newlines.
        """
        reading_page_id = (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        )
        if reading_page_id:
            # Do not commit the id yet: SAX may split one text node across
            # several characters() calls.
            self._page_id_parts.append(content)
        elif self.inText:
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        """Parse ``text`` and hand the result to ``handler.write_entry``."""
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        """Run ``_process`` while holding ``self.sem``."""
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
|
Loading…
x
Reference in New Issue
Block a user