mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-31 15:12:47 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			43 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			43 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from logging import getLogger
 | |
| import zlib
 | |
| import re
 | |
| import aiohttp
 | |
| 
 | |
| logger = getLogger(__name__)
 | |
| 
 | |
# One row of the page_props INSERT statement:
# (page,'prop','value',NULL_or_number)
_PAGE_PROP_TUPLE_RE = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

# Maximum number of unparsed trailing characters carried over to the next
# chunk so that a tuple split across a chunk boundary can still be matched.
_TAIL_CHARS = 1000


def _scan_wikibase_items(buffer: str) -> tuple[list[tuple[str, str]], str]:
    """Extract ``wikibase_item`` rows from *buffer*.

    Args:
        buffer: Decoded SQL text, possibly ending in the middle of a tuple.

    Returns:
        A pair ``(pairs, remainder)``. ``pairs`` lists ``(page_id, value)``
        for every complete tuple whose property name is ``wikibase_item``,
        in order of appearance. ``remainder`` is the text after the last
        complete tuple (capped at ``_TAIL_CHARS`` characters); prepend it to
        the next piece of text before scanning again.
    """
    pairs: list[tuple[str, str]] = []
    consumed = 0
    for match in _PAGE_PROP_TUPLE_RE.finditer(buffer):
        consumed = match.end()
        if match.group(2) == "wikibase_item":
            pairs.append((match.group(1), match.group(3)))
    # Everything up to the last match is fully handled; dropping it avoids
    # matching (and logging) the same tuples again on the next chunk.
    remainder = buffer[consumed:]
    if len(remainder) > _TAIL_CHARS:
        remainder = remainder[-_TAIL_CHARS:]
    return pairs, remainder


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.

    The dump is streamed in 1 MiB chunks, decompressed and scanned
    incrementally, so the whole file is never held in memory.

    Returns:
        A dict mapping each page id (as a string of digits) to the value of
        its ``wikibase_item`` property.

    Raises:
        Whatever ``resp.raise_for_status()`` raises when the server answers
        with an HTTP error status, plus any error raised by the download or
        by ``zlib`` on corrupt data.
    """
    # Local import keeps this function self-contained; codecs is stdlib.
    import codecs

    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # Incremental decoding keeps multi-byte characters intact when they are
    # split across two decompressed chunks; invalid bytes are still skipped.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
    buffer = ""
    mappings: dict[str, str] = {}

    def consume(data: bytes, final: bool = False) -> None:
        """Decode *data*, scan it, and record every mapping found."""
        nonlocal buffer
        buffer += decoder.decode(data, final)
        pairs, buffer = _scan_wikibase_items(buffer)
        for page_id, value in pairs:
            logger.debug("Found mapping %s -> %s", page_id, value)
            mappings[page_id] = value

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if data:
                    consume(data)

    # Pick up anything the decompressor or decoder is still holding back.
    consume(decomp.flush(), final=True)
    return mappings
 | 
