mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-29 22:22:44 +00:00 
			
		
		
		
	fix newlines being stripped from the resulting documents
This commit is contained in:
		| @@ -129,16 +129,18 @@ class WikiDumpHandler(xml.sax.ContentHandler): | ||||
|          self.currentTag = None | ||||
|   | ||||
|      def characters(self, content): | ||||
| -        if not content.strip(): | ||||
| -            return | ||||
| +        # Only filter whitespace for ID fields, preserve all content for text | ||||
|          if ( | ||||
|              self.currentTag == "id" | ||||
|              and self.inPage | ||||
|              and not self.inRevision | ||||
|              and not self.currentPageId | ||||
|          ): | ||||
| -            self.currentPageId = content.strip() | ||||
| +            content_stripped = content.strip() | ||||
| +            if content_stripped:  # Only process non-empty ID content | ||||
| +                self.currentPageId = content_stripped | ||||
|          elif self.inText: | ||||
| +            # Always append text content, even if it's just whitespace or newlines | ||||
|              self.currentText.append(content) | ||||
|   | ||||
|      async def _process(self, text: str, uid: str): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Bruce Röttgers
					Bruce Röttgers