mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-07-15 02:54:05 +00:00
add bounding, env var config
This commit is contained in:
parent
3bfd30a073
commit
e0aa134ecc
1
.gitignore
vendored
1
.gitignore
vendored
@ -11,3 +11,4 @@ wheels/
|
|||||||
|
|
||||||
.env
|
.env
|
||||||
node_modules
|
node_modules
|
||||||
|
output
|
@ -1,2 +1,2 @@
|
|||||||
from .base_handler import BaseHandler
|
from .base_handler import BaseHandler
|
||||||
from .filesystm_handler import FileSystemHandler
|
from .filesystem import FilesystemHandler
|
@ -2,8 +2,9 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import aiofiles
|
import aiofiles
|
||||||
from .base_handler import BaseHandler
|
from .base_handler import BaseHandler
|
||||||
|
import json
|
||||||
|
|
||||||
class FileSystemHandler(BaseHandler):
|
class FilesystemHandler(BaseHandler):
|
||||||
"""
|
"""
|
||||||
Handler that writes files to the filesystem.
|
Handler that writes files to the filesystem.
|
||||||
"""
|
"""
|
||||||
@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler):
|
|||||||
try:
|
try:
|
||||||
file_path = self.output_dir / f"{uid}.json"
|
file_path = self.output_dir / f"{uid}.json"
|
||||||
async with aiofiles.open(file_path, 'w') as f:
|
async with aiofiles.open(file_path, 'w') as f:
|
||||||
await f.write(entry)
|
await f.write(json.dumps(entry))
|
||||||
return True
|
return True
|
||||||
except IOError as e:
|
except IOError as e:
|
||||||
self.logger.error(f"Error writing entry {uid}: {e}")
|
self.logger.error(f"Error writing entry {uid}: {e}")
|
@ -7,5 +7,6 @@ requires-python = ">=3.12"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"aiofiles>=24.1.0",
|
"aiofiles>=24.1.0",
|
||||||
"mwparserfromhell>=0.6.6",
|
"mwparserfromhell>=0.6.6",
|
||||||
|
"python-dotenv>=1.1.0",
|
||||||
"wikitextparser>=0.56.3",
|
"wikitextparser>=0.56.3",
|
||||||
]
|
]
|
||||||
|
@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
|
import asyncio
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
class WikivoyageParser:
|
class WikivoyageParser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -342,41 +347,120 @@ class WikivoyageParser:
|
|||||||
|
|
||||||
return json.dumps(root, indent=indent)
|
return json.dumps(root, indent=indent)
|
||||||
|
|
||||||
def process_file(input_file: Path, parser: WikivoyageParser) -> None:
|
async def process_file(
|
||||||
"""Process a single wiki file and save JSON output"""
|
input_file: Path,
|
||||||
# Create output path with .json extension
|
parser: WikivoyageParser,
|
||||||
output_file = input_file.with_suffix('.json')
|
handler,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Parse one wiki file and hand the resulting entry off to our handler.
|
||||||
|
Uses the filename (sans suffix) as the unique UID.
|
||||||
|
"""
|
||||||
|
|
||||||
# Ensure output directory exists
|
text = input_file.read_text(encoding="utf-8")
|
||||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
entry = parser.parse(text) # assume returns a dict
|
||||||
|
uid = input_file.stem
|
||||||
|
|
||||||
|
await handler.write_entry(entry, uid)
|
||||||
|
|
||||||
|
def gather_handler_kwargs(handler_name: str) -> dict:
|
||||||
|
"""
|
||||||
|
Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
|
||||||
|
E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
|
||||||
|
"""
|
||||||
|
prefix = f"HANDLER_{handler_name.upper()}_"
|
||||||
|
kwargs = {}
|
||||||
|
|
||||||
|
for env_key, val in os.environ.items():
|
||||||
|
if not env_key.startswith(prefix):
|
||||||
|
continue
|
||||||
|
param = env_key[len(prefix) :].lower()
|
||||||
|
# try to cast ints
|
||||||
|
if val.isdigit():
|
||||||
|
val = int(val)
|
||||||
|
kwargs[param] = val
|
||||||
|
|
||||||
|
return kwargs
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
# 1. Which handler to load?
|
||||||
|
handler_name = os.getenv("HANDLER")
|
||||||
|
if not handler_name:
|
||||||
|
print("Error: set ENV HANDLER (e.g. 'filesystem')")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# 2. Dynamic import
|
||||||
|
module_path = f"output_handlers.{handler_name}"
|
||||||
try:
|
try:
|
||||||
# Read and parse input file
|
mod = importlib.import_module(module_path)
|
||||||
with open(input_file, 'r', encoding='utf-8') as f:
|
except ImportError as e:
|
||||||
wikitext = f.read()
|
print(f"Error loading handler module {module_path}: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
result = parser.parse(wikitext)
|
# 3. Find the class: e.g. "sftp" → "SftpHandler"
|
||||||
|
class_name = handler_name.title().replace("_", "") + "Handler"
|
||||||
|
if not hasattr(mod, class_name):
|
||||||
|
print(f"{module_path} defines no class {class_name}")
|
||||||
|
sys.exit(1)
|
||||||
|
HandlerCls = getattr(mod, class_name)
|
||||||
|
|
||||||
# Write JSON output
|
# 4. Build kwargs from ENV
|
||||||
with open(output_file, 'w', encoding='utf-8') as f:
|
handler_kwargs = gather_handler_kwargs(handler_name)
|
||||||
f.write(parser.export_json())
|
|
||||||
|
|
||||||
except Exception as e:
|
# 5. Instantiate
|
||||||
print(f"Error processing {input_file}: {e}")
|
handler = HandlerCls(**handler_kwargs)
|
||||||
|
|
||||||
def main():
|
# 6. Prepare parser
|
||||||
# Initialize parser once for reuse
|
|
||||||
parser = WikivoyageParser()
|
parser = WikivoyageParser()
|
||||||
|
|
||||||
# Get input directory from command line or use current directory
|
# 7. Which dir to walk?
|
||||||
input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.')
|
input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
|
||||||
|
txt_files = list(input_dir.rglob("*.txt"))
|
||||||
|
|
||||||
# Process all .txt files recursively
|
if not txt_files:
|
||||||
for txt_file in input_dir.rglob('*.txt'):
|
print(f"No .txt files found under {input_dir}")
|
||||||
print(f"Processing {txt_file}")
|
else:
|
||||||
process_file(txt_file, parser)
|
for txt in txt_files:
|
||||||
|
await process_file(txt, parser, handler)
|
||||||
|
|
||||||
|
# 7) read concurrency setting
|
||||||
|
try:
|
||||||
|
max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
|
||||||
|
except ValueError:
|
||||||
|
print("Error: MAX_CONCURRENT must be an integer")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if max_conc < 0:
|
||||||
|
print("Error: MAX_CONCURRENT must be >= 0")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# 8) schedule tasks
|
||||||
|
if max_conc == 0:
|
||||||
|
# unbounded
|
||||||
|
tasks = [
|
||||||
|
asyncio.create_task(process_file(txt, parser, handler))
|
||||||
|
for txt in txt_files
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
# bounded by semaphore
|
||||||
|
sem = asyncio.Semaphore(max_conc)
|
||||||
|
|
||||||
|
async def bounded(txt):
|
||||||
|
async with sem:
|
||||||
|
return await process_file(txt, parser, handler)
|
||||||
|
|
||||||
|
tasks = [
|
||||||
|
asyncio.create_task(bounded(txt))
|
||||||
|
for txt in txt_files
|
||||||
|
]
|
||||||
|
|
||||||
|
# 9) run them all
|
||||||
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
print("All done.")
|
||||||
|
|
||||||
print("Processing complete")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
asyncio.run(main())
|
12
uv.lock
generated
12
uv.lock
generated
@ -1,5 +1,4 @@
|
|||||||
version = 1
|
version = 1
|
||||||
revision = 1
|
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -18,6 +17,7 @@ source = { virtual = "." }
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "aiofiles" },
|
{ name = "aiofiles" },
|
||||||
{ name = "mwparserfromhell" },
|
{ name = "mwparserfromhell" },
|
||||||
|
{ name = "python-dotenv" },
|
||||||
{ name = "wikitextparser" },
|
{ name = "wikitextparser" },
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -25,6 +25,7 @@ dependencies = [
|
|||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "aiofiles", specifier = ">=24.1.0" },
|
{ name = "aiofiles", specifier = ">=24.1.0" },
|
||||||
{ name = "mwparserfromhell", specifier = ">=0.6.6" },
|
{ name = "mwparserfromhell", specifier = ">=0.6.6" },
|
||||||
|
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
||||||
{ name = "wikitextparser", specifier = ">=0.56.3" },
|
{ name = "wikitextparser", specifier = ">=0.56.3" },
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -41,6 +42,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
|
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-dotenv"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex"
|
name = "regex"
|
||||||
version = "2024.11.6"
|
version = "2024.11.6"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user