mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-07-14 18:44:06 +00:00
add bounding, env var config
This commit is contained in:
parent
3bfd30a073
commit
e0aa134ecc
1
.gitignore
vendored
1
.gitignore
vendored
@ -11,3 +11,4 @@ wheels/
|
||||
|
||||
.env
|
||||
node_modules
|
||||
output
|
@ -1,2 +1,2 @@
|
||||
from .base_handler import BaseHandler
|
||||
from .filesystm_handler import FileSystemHandler
|
||||
from .filesystem import FilesystemHandler
|
@ -2,8 +2,9 @@
|
||||
from pathlib import Path
|
||||
import aiofiles
|
||||
from .base_handler import BaseHandler
|
||||
import json
|
||||
|
||||
class FileSystemHandler(BaseHandler):
|
||||
class FilesystemHandler(BaseHandler):
|
||||
"""
|
||||
Handler that writes files to the filesystem.
|
||||
"""
|
||||
@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler):
|
||||
try:
|
||||
file_path = self.output_dir / f"{uid}.json"
|
||||
async with aiofiles.open(file_path, 'w') as f:
|
||||
await f.write(entry)
|
||||
await f.write(json.dumps(entry))
|
||||
return True
|
||||
except IOError as e:
|
||||
self.logger.error(f"Error writing entry {uid}: {e}")
|
@ -7,5 +7,6 @@ requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"aiofiles>=24.1.0",
|
||||
"mwparserfromhell>=0.6.6",
|
||||
"python-dotenv>=1.1.0",
|
||||
"wikitextparser>=0.56.3",
|
||||
]
|
||||
|
@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import asyncio
|
||||
import importlib
|
||||
import logging
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
class WikivoyageParser:
|
||||
def __init__(self):
|
||||
@ -342,41 +347,120 @@ class WikivoyageParser:
|
||||
|
||||
return json.dumps(root, indent=indent)
|
||||
|
||||
def process_file(input_file: Path, parser: WikivoyageParser) -> None:
|
||||
"""Process a single wiki file and save JSON output"""
|
||||
# Create output path with .json extension
|
||||
output_file = input_file.with_suffix('.json')
|
||||
async def process_file(
|
||||
input_file: Path,
|
||||
parser: WikivoyageParser,
|
||||
handler,
|
||||
) -> None:
|
||||
"""
|
||||
Parse one wiki file and hand the resulting entry off to our handler.
|
||||
Uses the filename (sans suffix) as the unique UID.
|
||||
"""
|
||||
|
||||
# Ensure output directory exists
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Read and parse input file
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
wikitext = f.read()
|
||||
|
||||
result = parser.parse(wikitext)
|
||||
|
||||
# Write JSON output
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(parser.export_json())
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {input_file}: {e}")
|
||||
text = input_file.read_text(encoding="utf-8")
|
||||
entry = parser.parse(text) # assume returns a dict
|
||||
uid = input_file.stem
|
||||
|
||||
def main():
|
||||
# Initialize parser once for reuse
|
||||
await handler.write_entry(entry, uid)
|
||||
|
||||
def gather_handler_kwargs(handler_name: str) -> dict:
|
||||
"""
|
||||
Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
|
||||
E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
|
||||
"""
|
||||
prefix = f"HANDLER_{handler_name.upper()}_"
|
||||
kwargs = {}
|
||||
|
||||
for env_key, val in os.environ.items():
|
||||
if not env_key.startswith(prefix):
|
||||
continue
|
||||
param = env_key[len(prefix) :].lower()
|
||||
# try to cast ints
|
||||
if val.isdigit():
|
||||
val = int(val)
|
||||
kwargs[param] = val
|
||||
|
||||
return kwargs
|
||||
|
||||
async def main():
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
# 1. Which handler to load?
|
||||
handler_name = os.getenv("HANDLER")
|
||||
if not handler_name:
|
||||
print("Error: set ENV HANDLER (e.g. 'filesystem')")
|
||||
sys.exit(1)
|
||||
|
||||
# 2. Dynamic import
|
||||
module_path = f"output_handlers.{handler_name}"
|
||||
try:
|
||||
mod = importlib.import_module(module_path)
|
||||
except ImportError as e:
|
||||
print(f"Error loading handler module {module_path}: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# 3. Find the class: e.g. "sftp" → "SftpHandler"
|
||||
class_name = handler_name.title().replace("_", "") + "Handler"
|
||||
if not hasattr(mod, class_name):
|
||||
print(f"{module_path} defines no class {class_name}")
|
||||
sys.exit(1)
|
||||
HandlerCls = getattr(mod, class_name)
|
||||
|
||||
# 4. Build kwargs from ENV
|
||||
handler_kwargs = gather_handler_kwargs(handler_name)
|
||||
|
||||
# 5. Instantiate
|
||||
handler = HandlerCls(**handler_kwargs)
|
||||
|
||||
# 6. Prepare parser
|
||||
parser = WikivoyageParser()
|
||||
|
||||
# Get input directory from command line or use current directory
|
||||
input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.')
|
||||
|
||||
# Process all .txt files recursively
|
||||
for txt_file in input_dir.rglob('*.txt'):
|
||||
print(f"Processing {txt_file}")
|
||||
process_file(txt_file, parser)
|
||||
|
||||
print("Processing complete")
|
||||
|
||||
# 7. Which dir to walk?
|
||||
input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
|
||||
txt_files = list(input_dir.rglob("*.txt"))
|
||||
|
||||
if not txt_files:
|
||||
print(f"No .txt files found under {input_dir}")
|
||||
else:
|
||||
for txt in txt_files:
|
||||
await process_file(txt, parser, handler)
|
||||
|
||||
# 7) read concurrency setting
|
||||
try:
|
||||
max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
|
||||
except ValueError:
|
||||
print("Error: MAX_CONCURRENT must be an integer")
|
||||
sys.exit(1)
|
||||
|
||||
if max_conc < 0:
|
||||
print("Error: MAX_CONCURRENT must be >= 0")
|
||||
sys.exit(1)
|
||||
|
||||
# 8) schedule tasks
|
||||
if max_conc == 0:
|
||||
# unbounded
|
||||
tasks = [
|
||||
asyncio.create_task(process_file(txt, parser, handler))
|
||||
for txt in txt_files
|
||||
]
|
||||
else:
|
||||
# bounded by semaphore
|
||||
sem = asyncio.Semaphore(max_conc)
|
||||
|
||||
async def bounded(txt):
|
||||
async with sem:
|
||||
return await process_file(txt, parser, handler)
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(bounded(txt))
|
||||
for txt in txt_files
|
||||
]
|
||||
|
||||
# 9) run them all
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
print("All done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
asyncio.run(main())
|
12
uv.lock
generated
12
uv.lock
generated
@ -1,5 +1,4 @@
|
||||
version = 1
|
||||
revision = 1
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[[package]]
|
||||
@ -18,6 +17,7 @@ source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "aiofiles" },
|
||||
{ name = "mwparserfromhell" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "wikitextparser" },
|
||||
]
|
||||
|
||||
@ -25,6 +25,7 @@ dependencies = [
|
||||
requires-dist = [
|
||||
{ name = "aiofiles", specifier = ">=24.1.0" },
|
||||
{ name = "mwparserfromhell", specifier = ">=0.6.6" },
|
||||
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
||||
{ name = "wikitextparser", specifier = ">=0.56.3" },
|
||||
]
|
||||
|
||||
@ -41,6 +42,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "python-dotenv"
|
||||
version = "1.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "2024.11.6"
|
||||
|
Loading…
x
Reference in New Issue
Block a user