2025-04-30 14:01:20 +02:00

43 lines
1.5 KiB
Python

from logging import getLogger
import zlib
import re
import aiohttp
# Module-level logger named after this module, per stdlib logging convention.
logger = getLogger(__name__)
async def fetch_mappings() -> dict[str, str]:
    """
    Stream the enwikivoyage ``page_props`` SQL dump, gunzip it on the fly,
    and extract page -> wikibase_item mappings.

    Returns:
        Mapping of page id (decimal string) to its Wikidata item id,
        e.g. ``"12345" -> "Q1490"``.

    Raises:
        aiohttp.ClientResponseError: if the dump URL returns an HTTP error.
        zlib.error: if the stream is not valid gzip data.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # Match INSERT tuples: (page,'prop','value',NULL_or_number).  This is a
    # *bytes* pattern: scanning raw bytes avoids corrupting a multi-byte
    # UTF-8 character that happens to be split across two chunks (the old
    # per-chunk decode with errors="ignore" silently dropped such bytes).
    tuple_re = re.compile(rb"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = b""
    mappings: dict[str, str] = {}

    def scan(buf: bytes) -> bytes:
        """Harvest complete tuples from buf; return the unconsumed tail."""
        last_end = 0
        for m in tuple_re.finditer(buf):
            last_end = m.end()
            if m.group(2) == b"wikibase_item":
                page_id = m.group(1).decode("ascii")
                value = m.group(3).decode("utf-8", errors="ignore")
                # Lazy %-args: no f-string formatting cost when DEBUG is off.
                logger.debug("Found mapping %s -> %s", page_id, value)
                mappings[page_id] = value
        # Keep only the (possibly partial) tuple after the last full match,
        # capped so a match-free stream cannot grow the buffer unboundedly.
        # Trimming from last_end also stops already-seen tuples from being
        # re-matched and re-logged on the next chunk.
        return buf[last_end:][-1000:]

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if data:
                    buffer = scan(buffer + data)
            # zlib may hold back a final block; flush so trailing tuples
            # at end-of-stream are not silently dropped.
            buffer = scan(buffer + decomp.flush())
    return mappings