Mirror of https://github.com/bcye/structured-wikivoyage-exports.git, synced 2025-06-07 16:34:04 +00:00
support multiple handlers

commit 60c13fb9ec · parent f73046bd65

main.py (70 lines changed)
@@ -40,7 +40,7 @@ def gather_handler_kwargs(handler_name: str) -> dict:
 
 
 async def process_dump(
-    mappings: dict[str, str], handler, max_concurrent: int
+    mappings: dict[str, str], handlers, max_concurrent: int
 ):
     """
     Stream-download the bzip2-compressed XML dump and feed to SAX.
@@ -52,7 +52,7 @@ async def process_dump(
     )
     decomp = bz2.BZ2Decompressor()
     sax_parser = xml.sax.make_parser()
-    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
+    dump_handler = WikiDumpHandler(mappings, handlers, max_concurrent)
     sax_parser.setContentHandler(dump_handler)
 
     async with aiohttp.ClientSession() as session:
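The context lines above show the streaming pipeline the handlers plug into: compressed chunks are decompressed incrementally and fed straight to the SAX parser. A minimal, self-contained sketch of that pattern, using a local file as a stand-in for the aiohttp response stream (not the project's code):

    import bz2
    import xml.sax

    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(xml.sax.handler.ContentHandler())  # no-op handler

    # Feed decompressed bytes to the parser chunk by chunk, so the full
    # (multi-gigabyte) dump never has to fit in memory.
    with open("dump.xml.bz2", "rb") as f:
        for chunk in iter(lambda: f.read(64 * 1024), b""):
            sax_parser.feed(decomp.decompress(chunk))
    sax_parser.close()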
@@ -69,36 +69,45 @@ async def main():
     await asyncio.gather(*dump_handler.tasks)
 
 
 async def main():
-    # 1. Which handler to load?
-    handler_name = os.getenv("HANDLER")
-    if not handler_name:
-        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
+    # 1. Which handler(s) to load?
+    handler_names = os.getenv("HANDLER", "").split(",")
+    if not handler_names or not handler_names[0]:
+        logger.error("Error: set ENV HANDLER (e.g. 'filesystem' or 'filesystem,sftp')")
         sys.exit(1)
 
-    # 2. Dynamic import
-    module_path = f"output_handlers.{handler_name}"
-    try:
-        mod = importlib.import_module(module_path)
-    except ImportError as e:
-        logger.error(f"Error loading handler module {module_path}: {e}")
-        sys.exit(1)
+    handlers = []
+    # 2. Load each handler
+    for handler_name in handler_names:
+        handler_name = handler_name.strip()
+        if not handler_name:
+            continue
 
-    # 3. Find the class: e.g. "sftp" → "SftpHandler"
-    class_name = handler_name.title().replace("_", "") + "Handler"
-    if not hasattr(mod, class_name):
-        logger.error(f"{module_path} defines no class {class_name}")
-        sys.exit(1)
-    HandlerCls = getattr(mod, class_name)
+        # Dynamic import
+        module_path = f"output_handlers.{handler_name}"
+        try:
+            mod = importlib.import_module(module_path)
+        except ImportError as e:
+            logger.error(f"Error loading handler module {module_path}: {e}")
+            sys.exit(1)
+
+        # Find the class: e.g. "sftp" → "SftpHandler"
+        class_name = handler_name.title().replace("_", "") + "Handler"
+        if not hasattr(mod, class_name):
+            logger.error(f"{module_path} defines no class {class_name}")
+            sys.exit(1)
+        HandlerCls = getattr(mod, class_name)
 
-    logger.info(f"Using handler from {module_path}")
+        logger.info(f"Using handler from {module_path}")
 
-    # 4. Build kwargs from ENV
-    handler_kwargs = gather_handler_kwargs(handler_name)
+        # Build kwargs from ENV
+        handler_kwargs = gather_handler_kwargs(handler_name)
 
-    # 5. Instantiate
-    handler = HandlerCls(**handler_kwargs)
+        # Instantiate
+        handler = HandlerCls(**handler_kwargs)
+        handlers.append(handler)
 
-    # 6. read concurrency setting
+    # 3. read concurrency setting
     try:
         max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
     except ValueError:
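Two Python details the new loading loop leans on, shown as a standalone sketch (the handler names are illustrative, not from the repo):

    # str.split(",") never returns an empty list, so an unset HANDLER yields [""]:
    assert "".split(",") == [""]             # hence the `not handler_names[0]` guard
    assert "filesystem, sftp".split(",") == ["filesystem", " sftp"]  # hence .strip()

    # Module-to-class naming convention used by the dynamic import:
    for name in ("sftp", "filesystem", "local_fs"):
        print(name.title().replace("_", "") + "Handler")
    # → SftpHandler, FilesystemHandler, LocalFsHandler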
@@ -107,18 +116,17 @@ async def main():
     if max_conc < 0:
         raise ValueError("MAX_CONCURRENT must be >= 0")
 
-    # 7. Fetch mappings
+    # 4. Fetch mappings
     logger.info("Fetching mappings from SQL dump…")
     mappings = await fetch_mappings()
     logger.info(f"Got {len(mappings)} wikibase_item mappings.")
 
-    # 8. Stream & split the XML dump
+    # 5. Stream & split the XML dump
     logger.info("Processing XML dump…")
-    await process_dump(mappings, handler, max_conc)
+    await process_dump(mappings, handlers, max_conc)
 
-    # 5. Finish up
-    await handler.close()
+    # 6. Finish up
+    await asyncio.gather(*[handler.close() for handler in handlers])
     logger.info("All done.")
 
 
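Shutdown now awaits every handler's close() concurrently instead of making a single call. A runnable sketch of the pattern with a stand-in handler class (EchoHandler is hypothetical, not from the repo):

    import asyncio

    class EchoHandler:
        async def close(self) -> None:
            await asyncio.sleep(0.1)   # pretend to flush buffers
            print("closed")

    async def shutdown(handlers: list) -> None:
        # All close() coroutines run concurrently; total time ≈ the slowest one.
        await asyncio.gather(*[h.close() for h in handlers])

    asyncio.run(shutdown([EchoHandler(), EchoHandler()]))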
The second file in this commit updates the WikiDumpHandler SAX handler (its filename is not shown in this view):

@@ -9,13 +9,14 @@ class WikiDumpHandler(xml.sax.ContentHandler):
     """
     SAX handler that, for each <page> whose <id> is in mappings,
     collects the <text> and schedules an async task to parse
-    and write via the user‐supplied handler.
+    and write via the user‐supplied handler(s).
     """
 
-    def __init__(self, mappings, handler, max_concurrent):
+    def __init__(self, mappings, handlers, max_concurrent):
         super().__init__()
         self.mappings = mappings
-        self.handler = handler
+        # Support a single handler or a list of handlers
+        self.handlers = handlers
         self.sem = (
             asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
         )
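One caveat: the new comment promises support for a single handler or a list, but __init__ stores the argument as-is and later code iterates over self.handlers, so a bare handler object would break. A tiny normalization in the spirit of that comment (hypothetical, not part of the commit):

    def normalize_handlers(handlers):
        # Accept one handler object or a sequence of them; always return a list.
        return list(handlers) if isinstance(handlers, (list, tuple)) else [handlers]

    assert normalize_handlers("h") == ["h"]
    assert normalize_handlers(["h1", "h2"]) == ["h1", "h2"]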
@@ -98,7 +99,11 @@ class WikiDumpHandler(xml.sax.ContentHandler):
         parser = WikivoyageParser()
         entry = parser.parse(text)
         entry['properties']['title'] = title
-        await self.handler.write_entry(entry, uid)
+
+        # Write to all handlers concurrently
+        await asyncio.gather(*[
+            handler.write_entry(entry, uid) for handler in self.handlers
+        ])
 
     async def _bounded_process(self, text: str, uid: str, title: str):
         # Only run N at once
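The surrounding _bounded_process context uses the optional semaphore created in __init__ to cap how many page tasks run at once. A runnable sketch of that bounding pattern (the numbers are illustrative):

    import asyncio

    async def process(i: int) -> None:
        await asyncio.sleep(0.1)      # stand-in for parse + write_entry

    async def main() -> None:
        sem = asyncio.Semaphore(4)    # like MAX_CONCURRENT=4

        async def bounded(i: int) -> None:
            async with sem:           # at most 4 bodies execute concurrently
                await process(i)

        await asyncio.gather(*[bounded(i) for i in range(10)])

    asyncio.run(main())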