From 3330667fa7371bcfe8c6a3908b3cae2e578ea128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruce=20R=C3=B6ttgers?= Date: Wed, 30 Apr 2025 13:32:52 +0200 Subject: [PATCH] fix newlines being stripped from the resulting documents --- transform-documents.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transform-documents.py b/transform-documents.py index 8e43a33..40efae8 100644 --- a/transform-documents.py +++ b/transform-documents.py @@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler): self.currentTag = None def characters(self, content): - if not content.strip(): - return + # Only filter whitespace for ID fields, preserve all content for text if ( self.currentTag == "id" and self.inPage and not self.inRevision and not self.currentPageId ): - self.currentPageId = content.strip() + content_stripped = content.strip() + if content_stripped: # Only process non-empty ID content + self.currentPageId = content_stripped elif self.inText: + # Always append text content, even if it's just whitespace or newlines self.currentText.append(content) async def _process(self, text: str, uid: str):