Mirror of https://github.com/bcye/structured-wikivoyage-exports.git (synced 2025-06-07 16:34:04 +00:00)

Merge pull request #18 from bcye/feature/only-python

Integrate Node Script into Python

Commit 729d4adc62
@@ -1,30 +1,21 @@
+#!/usr/bin/env python3
 import os
-from pathlib import Path
 import sys
+import re
+import zlib
+import bz2
 import asyncio
-import importlib
 import logging
+import importlib
+import xml.sax
+from pathlib import Path
 from dotenv import load_dotenv
-from parser import WikivoyageParser
+import aiohttp
+from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser
+


 logger = logging.getLogger(__name__)

-async def process_file(
-    input_file: Path,
-    handler,
-) -> None:
-    """
-    Parse one wiki file and hand the resulting entry off to our handler.
-    Uses the filename (sans suffix) as the unique UID.
-    """
-
-    text = input_file.read_text(encoding="utf-8")
-    parser = WikivoyageParser()
-    entry = parser.parse(text)  # assume returns a dict
-    uid = input_file.stem
-
-    await handler.write_entry(entry, uid)
-
 def gather_handler_kwargs(handler_name: str) -> dict:
     """
     Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
@@ -47,12 +38,41 @@ def gather_handler_kwargs(handler_name: str) -> dict:
     logger.debug(f"Handler kwargs: {kwargs}")
     return kwargs

-
-async def main():
+
+async def process_dump(
+    mappings: dict[str, str], handler, max_concurrent: int
+):
+    """
+    Stream-download the bzip2-compressed XML dump and feed to SAX.
+    """
+    xml_url = (
+        "https://dumps.wikimedia.org/"
+        "enwikivoyage/latest/"
+        "enwikivoyage-latest-pages-articles.xml.bz2"
+    )
+    decomp = bz2.BZ2Decompressor()
+    sax_parser = xml.sax.make_parser()
+    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
+    sax_parser.setContentHandler(dump_handler)
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(xml_url) as resp:
+            resp.raise_for_status()
+            async for chunk in resp.content.iter_chunked(1024 * 1024):
+                data = decomp.decompress(chunk)
+                if not data:
+                    continue
+                text = data.decode("utf-8", errors="ignore")
+                sax_parser.feed(text)
+    sax_parser.close()
+    if dump_handler.tasks:
+        await asyncio.gather(*dump_handler.tasks)
+
+
+async def main():
     # 1. Which handler to load?
     handler_name = os.getenv("HANDLER")
     if not handler_name:
-        print("Error: set ENV HANDLER (e.g. 'filesystem')")
+        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
         sys.exit(1)

     # 2. Dynamic import
@@ -60,31 +80,25 @@ async def main():
     try:
         mod = importlib.import_module(module_path)
     except ImportError as e:
-        print(f"Error loading handler module {module_path}: {e}")
+        logger.error(f"Error loading handler module {module_path}: {e}")
         sys.exit(1)

     # 3. Find the class: e.g. "sftp" → "SftpHandler"
     class_name = handler_name.title().replace("_", "") + "Handler"
     if not hasattr(mod, class_name):
-        print(f"{module_path} defines no class {class_name}")
+        logger.error(f"{module_path} defines no class {class_name}")
         sys.exit(1)
     HandlerCls = getattr(mod, class_name)

+    logger.info(f"Using handler from {module_path}")
+
     # 4. Build kwargs from ENV
     handler_kwargs = gather_handler_kwargs(handler_name)

     # 5. Instantiate
     handler = HandlerCls(**handler_kwargs)

-    # 6. Which dir to walk?
-    input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
-    txt_files = list(input_dir.rglob("*.txt"))
-
-    if not txt_files:
-        logger.info(f"No .txt files found under {input_dir}")
-        sys.exit(1)
-
-    # 7. read concurrency setting
+    # 6. read concurrency setting
     try:
         max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
     except ValueError:
@@ -93,31 +107,18 @@ async def main():
     if max_conc < 0:
         raise ValueError("MAX_CONCURRENT must be >= 0")

-    # 8. schedule tasks
-    if max_conc == 0:
-        # unbounded
-        tasks = [
-            asyncio.create_task(process_file(txt, handler))
-            for txt in txt_files
-        ]
-    else:
-        # bounded by semaphore
-        sem = asyncio.Semaphore(max_conc)
-
-        async def bounded(txt):
-            async with sem:
-                return await process_file(txt, handler)
-
-        tasks = [
-            asyncio.create_task(bounded(txt))
-            for txt in txt_files
-        ]
-
-    # 9. run them all
-    await asyncio.gather(*tasks)
+    # 7. Fetch mappings
+    logger.info("Fetching mappings from SQL dump…")
+    mappings = await fetch_mappings()
+    logger.info(f"Got {len(mappings)} wikibase_item mappings.")
+
+    # 8. Stream & split the XML dump
+    logger.info("Processing XML dump…")
+    await process_dump(mappings, handler, max_conc)
+
+    # 5. Finish up
     await handler.close()

     logger.info("All done.")

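For orientation, the handler object that main() builds (and that the new dump pipeline ultimately calls) only needs to expose two coroutines: write_entry(entry, uid) and close(). Below is a minimal sketch of a compatible handler; the module layout, class body, and output_dir keyword are illustrative assumptions, not part of this commit.

# Hypothetical example, not part of this commit: a module that the
# module_path in main() could resolve to (its exact location is not shown in this diff).
import json
from pathlib import Path


class FilesystemHandler:
    """Writes each parsed entry as <uid>.json inside an output directory."""

    def __init__(self, output_dir: str = "output"):
        # keyword arguments like output_dir would arrive via HANDLER_FILESYSTEM_* env vars
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def write_entry(self, entry: dict, uid: str) -> None:
        # called once per mapped <page> by WikiDumpHandler._process
        (self.output_dir / f"{uid}.json").write_text(
            json.dumps(entry), encoding="utf-8"
        )

    async def close(self) -> None:
        # called at the end of main(); nothing to release for plain files
        pass

With HANDLER=filesystem set (plus any HANDLER_FILESYSTEM_* variables for keyword arguments), main() would import the module, resolve FilesystemHandler by name, instantiate it with the gathered kwargs, and hand it to process_dump.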
package-lock.json (generated), 183 lines deleted
@@ -1,183 +0,0 @@
{
  "name": "mapvoyage-extract",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "dependencies": {
        "sax": "^1.4.1",
        "unbzip2-stream": "^1.4.3"
      },
      "devDependencies": {
        "@types/node": "^22.14.0",
        "@types/sax": "^1.2.7",
        "@types/unbzip2-stream": "^1.4.3",
        "prettier": "^3.4.2",
        "typescript": "^5.8.2"
      }
    },
    "node_modules/@types/node": {
      "version": "22.14.0",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.14.0.tgz",
      "integrity": "sha512-Kmpl+z84ILoG+3T/zQFyAJsU6EPTmOCj8/2+83fSN6djd6I4o7uOuGIH6vq3PrjY5BGitSbFuMN18j3iknubbA==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "undici-types": "~6.21.0"
      }
    },
    "node_modules/@types/sax": {
      "version": "1.2.7",
      "resolved": "https://registry.npmjs.org/@types/sax/-/sax-1.2.7.tgz",
      "integrity": "sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/through": {
      "version": "0.0.33",
      "resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz",
      "integrity": "sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/unbzip2-stream": {
      "version": "1.4.3",
      "resolved": "https://registry.npmjs.org/@types/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
      "integrity": "sha512-D8X5uuJRISqc8YtwL8jNW2FpPdUOCYXbfD6zNROCTbVXK9nawucxh10tVXE3MPjnHdRA1LvB0zDxVya/lBsnYw==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/through": "*"
      }
    },
    "node_modules/base64-js": {
      "version": "1.5.1",
      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
      "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT"
    },
    "node_modules/buffer": {
      "version": "5.7.1",
      "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
      "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT",
      "dependencies": {
        "base64-js": "^1.3.1",
        "ieee754": "^1.1.13"
      }
    },
    "node_modules/ieee754": {
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
      "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "BSD-3-Clause"
    },
    "node_modules/prettier": {
      "version": "3.5.3",
      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
      "integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
      "dev": true,
      "license": "MIT",
      "bin": {
        "prettier": "bin/prettier.cjs"
      },
      "engines": {
        "node": ">=14"
      },
      "funding": {
        "url": "https://github.com/prettier/prettier?sponsor=1"
      }
    },
    "node_modules/sax": {
      "version": "1.4.1",
      "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz",
      "integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==",
      "license": "ISC"
    },
    "node_modules/through": {
      "version": "2.3.8",
      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==",
      "license": "MIT"
    },
    "node_modules/typescript": {
      "version": "5.8.2",
      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.2.tgz",
      "integrity": "sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==",
      "dev": true,
      "license": "Apache-2.0",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
      },
      "engines": {
        "node": ">=14.17"
      }
    },
    "node_modules/unbzip2-stream": {
      "version": "1.4.3",
      "resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
      "integrity": "sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==",
      "license": "MIT",
      "dependencies": {
        "buffer": "^5.2.1",
        "through": "^2.3.8"
      }
    },
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
      "dev": true,
      "license": "MIT"
    }
  }
}
package.json, 14 lines deleted
@@ -1,14 +0,0 @@
{
  "private": true,
  "devDependencies": {
    "@types/node": "^22.14.0",
    "@types/sax": "^1.2.7",
    "@types/unbzip2-stream": "^1.4.3",
    "prettier": "^3.4.2",
    "typescript": "^5.8.2"
  },
  "dependencies": {
    "sax": "^1.4.1",
    "unbzip2-stream": "^1.4.3"
  }
}
split-dump.ts, 192 lines deleted
@@ -1,192 +0,0 @@
import fs from "fs";
import https from "https";
import path from "path";
import sax from "sax";
import bz2 from "unbzip2-stream";
import { createGunzip } from "zlib";

// Local storage configuration
const OUTPUT_FOLDER = "myfolder";

// --- Step 1: Fetch mappings from SQL dump ---
async function fetchMappings(): Promise<Record<string, string>> {
  return new Promise((resolve, reject) => {
    const sqlUrl =
      "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-page_props.sql.gz";
    https
      .get(sqlUrl, (res) => {
        if (res.statusCode !== 200) {
          return reject(
            new Error(`Failed to get SQL dump, status code: ${res.statusCode}`),
          );
        }
        const gunzip = createGunzip();
        let buffer = "";
        const mappings: Record<string, string> = {};
        res.pipe(gunzip);
        gunzip.on("data", (chunk: Buffer) => {
          buffer += chunk.toString();
          const regex = /\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)/g;
          let match: RegExpExecArray | null;
          while ((match = regex.exec(buffer)) !== null) {
            const [, pp_page, pp_propname, pp_value] = match;
            if (pp_propname === "wikibase_item") {
              mappings[pp_page] = pp_value;
            }
          }
          // Keep a tail to handle chunk splits
          if (buffer.length > 1000) {
            buffer = buffer.slice(-1000);
          }
        });
        gunzip.on("end", () => resolve(mappings));
        gunzip.on("error", reject);
      })
      .on("error", reject);
  });
}

// --- Helper to save file locally ---
let saveCount = 0;
function saveToLocalFile(filename: string, data: string): Promise<void> {
  return new Promise((resolve, reject) => {
    // Create directory if it doesn't exist
    if (!fs.existsSync(OUTPUT_FOLDER)) {
      fs.mkdirSync(OUTPUT_FOLDER, { recursive: true });
    }

    const filePath = path.join(OUTPUT_FOLDER, filename);
    fs.writeFile(filePath, data, (err) => {
      if (err) {
        reject(err);
      } else {
        console.log(`File saved successfully (${++saveCount}): ${filePath}`);
        resolve();
      }
    });
  });
}

// Simple semaphore to limit concurrency
class Semaphore {
  private tasks: (() => void)[] = [];
  private count: number;
  constructor(count: number) {
    this.count = count;
  }
  async acquire(): Promise<() => void> {
    return new Promise((release) => {
      const task = () => {
        this.count--;
        release(() => {
          this.count++;
          if (this.tasks.length > 0) {
            const next = this.tasks.shift()!;
            next();
          }
        });
      };
      if (this.count > 0) {
        task();
      } else {
        this.tasks.push(task);
      }
    });
  }
}

// --- Step 3: Process the XML dump ---
async function processXML(mappings: Record<string, string>): Promise<void> {
  return new Promise((resolve, reject) => {
    const xmlUrl =
      "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2";
    https
      .get(xmlUrl, (res) => {
        if (res.statusCode !== 200) {
          return reject(
            new Error(`Failed to fetch XML dump: ${res.statusCode}`),
          );
        }
        // Pipe through bz2 decompressor
        const stream = res.pipe(bz2());
        // Use sax for streaming XML parsing
        const parser = sax.createStream(true, {});
        let currentPageId: string | null = null;
        let currentText: string | null = null;
        let inPage = false;
        let inRevision = false;
        let inText = false;
        let currentTag: string | null = null; // Track current tag
        parser.on("opentag", (node) => {
          currentTag = node.name; // Track current tag
          if (node.name === "page") {
            inPage = true;
            currentPageId = null;
            currentText = null;
          } else if (node.name === "revision") {
            inRevision = true;
          } else if (inRevision && node.name === "text") {
            inText = true;
          }
        });
        parser.on("closetag", (tagName) => {
          if (tagName === "page") {
            if (
              typeof currentPageId == "string" &&
              currentText !== null &&
              !!mappings[currentPageId]
            ) {
              const wikidataId = mappings[currentPageId];
              const filename = `${wikidataId}.wiki.txt`;

              // Make a copy as the value will continue changing
              const textToSave = currentText.toString();

              saveToLocalFile(filename, textToSave).catch((err) =>
                console.error(`Save error for page ${currentPageId}:`, err)
              );
            }
            // Reset state for the next page
            inPage = false;
            currentPageId = null;
            currentText = null;
          } else if (tagName === "revision") {
            inRevision = false;
          } else if (tagName === "text") {
            inText = false;
          }
          currentTag = null; // Reset current tag
        });
        parser.on("text", (text) => {
          const trimmedText = text.trim();
          if (!trimmedText) return;
          if (currentTag === "id" && inPage && !inRevision && !currentPageId) {
            currentPageId = trimmedText;
          } else if (inText) {
            currentText = (currentText || "") + trimmedText;
          }
        });
        parser.on("error", reject);
        parser.on("end", resolve);
        stream.pipe(parser);
      })
      .on("error", reject);
  });
}

// --- Main integration ---
async function main() {
  try {
    console.log("Fetching mappings from SQL dump...");
    const mappings = await fetchMappings();
    console.log(`Fetched ${Object.keys(mappings).length} mappings.`);
    console.log("Processing XML dump...");
    await processXML(mappings);
    console.log("Processing complete.");
  } catch (err) {
    console.error("Error:", err);
  }
}

main().then(() => process.exit());
transformers/__init__.py, new file, 3 lines
@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser
transformers/fetch_mappings.py, new file, 42 lines
@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re
import aiohttp

logger = getLogger(__name__)

async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings
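fetch_mappings can be exercised on its own. The following is a minimal smoke-test sketch (assuming the transformers package is importable from the working directory, as main above assumes) that just reports how many page→wikibase_item pairs were extracted:

# Hypothetical smoke test, not part of this commit.
import asyncio
import logging

from transformers import fetch_mappings


async def check_mappings() -> None:
    mappings = await fetch_mappings()
    print(f"extracted {len(mappings)} page -> wikibase_item mappings")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(check_mappings())

Note that this streams the full page_props SQL dump from dumps.wikimedia.org, so it is a network-heavy check rather than a unit test.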
transformers/wiki_dump_handler.py, new file, 96 lines
@@ -0,0 +1,96 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser

logger = getLogger(__name__)

class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
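Since WikiDumpHandler only schedules asyncio tasks and never awaits them itself, it has to be driven from inside a running event loop, and the caller must gather handler.tasks afterwards, as process_dump does in the diff above. A minimal offline sketch with an in-memory XML snippet and a stub write handler (both hypothetical; the wikitext is a placeholder for whatever WikivoyageParser expects):

# Hypothetical offline exercise, not part of this commit.
import asyncio
import xml.sax

from transformers import WikiDumpHandler

SAMPLE_XML = """<mediawiki>
  <page>
    <id>1</id>
    <revision><text>== Placeholder wikitext ==</text></revision>
  </page>
</mediawiki>"""


class PrintHandler:
    """Stand-in for a real handler; just reports what it is given."""

    async def write_entry(self, entry, uid):
        print(uid, type(entry))

    async def close(self):
        pass


async def run_sample() -> None:
    dump_handler = WikiDumpHandler({"1": "Q42"}, PrintHandler(), max_concurrent=2)
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(dump_handler)
    sax_parser.feed(SAMPLE_XML)  # tasks are created when </page> closes
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


asyncio.run(run_sample())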