mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-11-04 09:02:45 +00:00 
			
		
		
		
	fix newlines being stripped from the resulting documents
This commit is contained in:
		@@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler):
 | 
			
		||||
        self.currentTag = None
 | 
			
		||||
 | 
			
		||||
    def characters(self, content):
 | 
			
		||||
        if not content.strip():
 | 
			
		||||
            return
 | 
			
		||||
        # Only filter whitespace for ID fields, preserve all content for text
 | 
			
		||||
        if (
 | 
			
		||||
            self.currentTag == "id"
 | 
			
		||||
            and self.inPage
 | 
			
		||||
            and not self.inRevision
 | 
			
		||||
            and not self.currentPageId
 | 
			
		||||
        ):
 | 
			
		||||
            self.currentPageId = content.strip()
 | 
			
		||||
            content_stripped = content.strip()
 | 
			
		||||
            if content_stripped:  # Only process non-empty ID content
 | 
			
		||||
                self.currentPageId = content_stripped
 | 
			
		||||
        elif self.inText:
 | 
			
		||||
            # Always append text content, even if it's just whitespace or newlines
 | 
			
		||||
            self.currentText.append(content)
 | 
			
		||||
 | 
			
		||||
    async def _process(self, text: str, uid: str):
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user