mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-06-08 00:44:04 +00:00
Merge branch 'feature/only-python' into feature/docker
This commit is contained in:
commit
789da95e49
@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler):
|
|||||||
self.currentTag = None
|
self.currentTag = None
|
||||||
|
|
||||||
def characters(self, content):
|
def characters(self, content):
|
||||||
if not content.strip():
|
# Only filter whitespace for ID fields, preserve all content for text
|
||||||
return
|
|
||||||
if (
|
if (
|
||||||
self.currentTag == "id"
|
self.currentTag == "id"
|
||||||
and self.inPage
|
and self.inPage
|
||||||
and not self.inRevision
|
and not self.inRevision
|
||||||
and not self.currentPageId
|
and not self.currentPageId
|
||||||
):
|
):
|
||||||
self.currentPageId = content.strip()
|
content_stripped = content.strip()
|
||||||
|
if content_stripped: # Only process non-empty ID content
|
||||||
|
self.currentPageId = content_stripped
|
||||||
elif self.inText:
|
elif self.inText:
|
||||||
|
# Always append text content, even if it's just whitespace or newlines
|
||||||
self.currentText.append(content)
|
self.currentText.append(content)
|
||||||
|
|
||||||
async def _process(self, text: str, uid: str):
|
async def _process(self, text: str, uid: str):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user