fix newlines being stripped from the resulting documents

This commit is contained in:
Bruce Röttgers 2025-04-30 13:32:52 +02:00
parent 804ffeb82b
commit 3330667fa7

View File

@@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler):
self.currentTag = None self.currentTag = None
def characters(self, content): def characters(self, content):
if not content.strip(): # Only filter whitespace for ID fields, preserve all content for text
return
if ( if (
self.currentTag == "id" self.currentTag == "id"
and self.inPage and self.inPage
and not self.inRevision and not self.inRevision
and not self.currentPageId and not self.currentPageId
): ):
self.currentPageId = content.strip() content_stripped = content.strip()
if content_stripped: # Only process non-empty ID content
self.currentPageId = content_stripped
elif self.inText: elif self.inText:
# Always append text content, even if it's just whitespace or newlines
self.currentText.append(content) self.currentText.append(content)
async def _process(self, text: str, uid: str): async def _process(self, text: str, uid: str):