mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-06-07 16:34:04 +00:00
43 lines
1.5 KiB
Python
43 lines
1.5 KiB
Python
from logging import getLogger
|
|
import zlib
|
|
import re
|
|
import aiohttp
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = getLogger(__name__)
|
|
|
|
async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.

    The dump is streamed from dumps.wikimedia.org, decompressed and decoded
    incrementally, and scanned with a regular expression for value tuples of
    the form ``(page,'prop','value',NULL_or_number)``. Only tuples whose
    property name is ``wikibase_item`` are kept. The whole dump is never held
    in memory: only a short unscanned tail is carried from one chunk to the
    next so tuples split across chunk boundaries are still matched.

    Returns:
        A dict mapping each page id (the digits from the dump, kept as
        ``str``) to the value of that page's ``wikibase_item`` property.
        If the same page id occurs more than once, the last occurrence wins.

    Raises:
        aiohttp.ClientResponseError: if the server responds with an error
            status (raised by ``resp.raise_for_status()``).
        zlib.error: if the downloaded payload is not valid gzip data.
    """
    # Function-scope import: the module's import header does not provide it.
    import codecs

    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # An incremental decoder keeps the bytes of a multi-byte character that
    # is split across two chunks instead of silently dropping it.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    # Upper bound on the unscanned tail carried over between chunks.
    max_tail = 1000
    buffer = ""
    mappings: dict[str, str] = {}

    def consume(text: str) -> None:
        """Scan *text* appended to the carried-over tail and record matches."""
        nonlocal buffer
        if not text:
            return
        buffer += text
        consumed = 0
        for m in tuple_re.finditer(buffer):
            consumed = m.end()
            page_id, prop, value = m.group(1), m.group(2), m.group(3)
            if prop == "wikibase_item":
                logger.debug("Found mapping %s -> %s", page_id, value)
                mappings[page_id] = value
        # Drop everything up to the last complete match so it is not matched
        # (and logged) again on the next chunk, and cap the remainder so the
        # buffer stays small even across long stretches without matches.
        buffer = buffer[max(consumed, len(buffer) - max_tail):]

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                consume(decoder.decode(decomp.decompress(chunk)))
            # Flush whatever the decompressor and decoder still hold.
            consume(decoder.decode(decomp.flush(), final=True))

    return mappings
|