From 3330667fa7371bcfe8c6a3908b3cae2e578ea128 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bruce=20R=C3=B6ttgers?= <hi@bruceroettgers.eu>
Date: Wed, 30 Apr 2025 13:32:52 +0200
Subject: [PATCH] fix newlines being stripped from the resulting documents

---
 transform-documents.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/transform-documents.py b/transform-documents.py
index 8e43a33..40efae8 100644
--- a/transform-documents.py
+++ b/transform-documents.py
@@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler):
         self.currentTag = None
 
     def characters(self, content):
-        if not content.strip():
-            return
+        # Strip whitespace only for the page ID; text chunks are kept verbatim.
         if (
             self.currentTag == "id"
             and self.inPage
             and not self.inRevision
             and not self.currentPageId
         ):
-            self.currentPageId = content.strip()
+            content_stripped = content.strip()
+            if content_stripped:  # Only process non-empty ID content
+                self.currentPageId = content_stripped
         elif self.inText:
+            # Keep whitespace-only chunks too: they carry the text's newlines.
             self.currentText.append(content)
 
     async def _process(self, text: str, uid: str):