Mirror of https://github.com/bcye/structured-wikivoyage-exports.git (synced 2025-11-04 09:02:45 +00:00)
			
		
		
		
	fix newlines being stripped from the resulting documents
This commit is contained in:
		@@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler):
 | 
				
			|||||||
        self.currentTag = None
 | 
					        self.currentTag = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def characters(self, content):
 | 
					    def characters(self, content):
 | 
				
			||||||
        if not content.strip():
 | 
					        # Only filter whitespace for ID fields, preserve all content for text
 | 
				
			||||||
            return
 | 
					 | 
				
			||||||
        if (
 | 
					        if (
 | 
				
			||||||
            self.currentTag == "id"
 | 
					            self.currentTag == "id"
 | 
				
			||||||
            and self.inPage
 | 
					            and self.inPage
 | 
				
			||||||
            and not self.inRevision
 | 
					            and not self.inRevision
 | 
				
			||||||
            and not self.currentPageId
 | 
					            and not self.currentPageId
 | 
				
			||||||
        ):
 | 
					        ):
 | 
				
			||||||
            self.currentPageId = content.strip()
 | 
					            content_stripped = content.strip()
 | 
				
			||||||
 | 
					            if content_stripped:  # Only process non-empty ID content
 | 
				
			||||||
 | 
					                self.currentPageId = content_stripped
 | 
				
			||||||
        elif self.inText:
 | 
					        elif self.inText:
 | 
				
			||||||
 | 
					            # Always append text content, even if it's just whitespace or newlines
 | 
				
			||||||
            self.currentText.append(content)
 | 
					            self.currentText.append(content)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    async def _process(self, text: str, uid: str):
 | 
					    async def _process(self, text: str, uid: str):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user