Mirror of https://github.com/bcye/structured-wikivoyage-exports.git, synced 2025-11-01 07:32:45 +00:00
Merge pull request #40 from bcye/feature/s3-handler
Feature: S3 handler
@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "aiobotocore>=2.24.2",
     "aiofiles>=24.1.0",
     "aiohttp>=3.11.16",
     "asyncssh>=2.20.0",
@@ -54,8 +54,8 @@ async def process_dump(
     sax_parser = xml.sax.make_parser()
     dump_handler = WikiDumpHandler(mappings, handlers)
     sax_parser.setContentHandler(dump_handler)
-    async with aiohttp.ClientSession() as session:
+    timeout = aiohttp.ClientTimeout(total = 5000)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
         async with session.get(xml_url) as resp:
             resp.raise_for_status()
             async for chunk in resp.content.iter_chunked(1024 * 1024):
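Note: aiohttp.ClientTimeout(total=...) caps the entire request, including streaming of the response body, which is what makes it relevant for a long dump download. A minimal sketch of the pattern introduced above; the URL is a placeholder and not part of the commit:

import asyncio
import aiohttp

async def stream(url: str) -> int:
    # total covers connect + headers + body streaming; mirrors the 5000 s budget used above
    timeout = aiohttp.ClientTimeout(total=5000)
    read = 0
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                read += len(chunk)
    return read

# asyncio.run(stream("https://example.org/dump.xml"))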
@@ -116,7 +116,7 @@ async def main():
         handler_kwargs["max_concurrent"] = max_conc

         # Instantiate
-        handler = HandlerCls(**handler_kwargs)
+        handler = await HandlerCls.create(**handler_kwargs)
         handlers.append(handler)

     # 4. Fetch mappings
@@ -14,8 +14,12 @@ class BaseHandler(ABC):
     logger = logging.getLogger(__name__)
     _successful_writes = 0
     _failed_writes = 0
+    fail_on_error: bool
+    semaphore: asyncio.Semaphore = None

-    def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
+    @classmethod
+    @abstractmethod
+    async def create(cls, fail_on_error: bool = True, max_concurrent=0, **kwargs) -> "BaseHandler":
         """
         Initializes the BaseHandler with optional parameters.

@@ -25,10 +29,12 @@ class BaseHandler(ABC):
                 0 means unlimited concurrency.
             **kwargs: Additional keyword arguments for specific handler implementations.
         """
-        self.fail_on_error = fail_on_error
-        self.semaphore = None
+        obj = cls(**kwargs)
+        obj.fail_on_error = fail_on_error
         if max_concurrent > 0:
-            self.semaphore = asyncio.Semaphore(max_concurrent)
+            obj.semaphore = asyncio.Semaphore(max_concurrent)
+        obj.logger.info(f"Handler initialized with fail_on_error={obj.fail_on_error}, max_concurrent={max_concurrent}")
+        return obj


     @abstractmethod
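Note: replacing __init__ with an awaitable create() classmethod lets subclasses perform real I/O (opening sessions, probing buckets) while they are being constructed, which __init__ cannot await. A minimal sketch of the factory pattern on its own, not code from this repository:

import asyncio

class Resource:
    semaphore: asyncio.Semaphore | None = None

    @classmethod
    async def create(cls, max_concurrent: int = 0) -> "Resource":
        obj = cls()
        # any awaitable setup (connections, health checks) can happen here
        if max_concurrent > 0:
            obj.semaphore = asyncio.Semaphore(max_concurrent)
        return obj

async def main():
    res = await Resource.create(max_concurrent=4)
    print(res.semaphore)

asyncio.run(main())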
@@ -3,38 +3,39 @@ import aiohttp
 from .base_handler import BaseHandler

 class BunnyStorageHandler(BaseHandler):
-    def __init__(
-        self,
+    base_url: str
+    headers: dict
+    _session: aiohttp.ClientSession
+    _connector: aiohttp.TCPConnector
+
+    @classmethod
+    async def create(
+        cls,
         region: str,
         base_path: str,
         api_key: str,
-        fail_on_error: bool = True,
         keepalive_timeout: int = 75,
         **kwargs,
-    ):
-        super().__init__(fail_on_error=fail_on_error, **kwargs)
-        self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
-        self.headers = {
+    ) -> "BunnyStorageHandler":
+        obj = await super().create(**kwargs)
+        obj.base_url = f"https://{region}.bunnycdn.com/{base_path}"
+        obj.headers = {
             "AccessKey": api_key,
             "Content-Type": "application/json",
             "accept": "application/json",
         }

-        # initialized later, in a guaranteed async context
-        self._connector = None
-        self._session = None
-        self._keepalive_timeout = keepalive_timeout
+        # setup the aiohttp session and connector
+        obj._connector = aiohttp.TCPConnector(
+            # limit is implicitly set to 100
+            keepalive_timeout = keepalive_timeout,
+        )
+        obj._session = aiohttp.ClientSession(connector=obj._connector)
+        return obj

-    async def setup_connector(self):
-        if self._session is None:
-            self._connector = aiohttp.TCPConnector(
-                # limit is implicitly set to 100
-                keepalive_timeout = self._keepalive_timeout,
-            )
-            self._session = aiohttp.ClientSession(connector=self._connector)
-
     async def _write_entry(self, entry: dict, uid: str) -> bool:
-        await self.setup_connector()
         payload = json.dumps(entry).encode("utf-8")
         url = f"{self.base_url}/{uid}.json"

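Note: since create() is guaranteed to run inside the event loop, the connector and session can be built eagerly and the lazy setup_connector() path is dropped. A rough sketch of the connector settings in play, written outside any handler class and with illustrative values:

import aiohttp

async def make_session(keepalive_timeout: int = 75) -> aiohttp.ClientSession:
    connector = aiohttp.TCPConnector(
        # connection limit defaults to 100 when not passed explicitly
        keepalive_timeout=keepalive_timeout,
    )
    # the session reuses the connector's pooled connections across uploads
    return aiohttp.ClientSession(connector=connector)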
@@ -50,6 +51,7 @@ class BunnyStorageHandler(BaseHandler):
             self.logger.exception(f"Exception while uploading UID={uid}")
             return False

+
     async def close(self):
         await self._session.close()
         await self._connector.close()
@@ -8,7 +8,10 @@ class FilesystemHandler(BaseHandler):
     """
     Handler that writes files to the filesystem.
     """
-    def __init__(self, output_dir: str, **kwargs):
+    output_dir: Path
+
+    @classmethod
+    async def create(cls, output_dir: str, **kwargs) -> "FilesystemHandler":
         """
         Initializes the FileSystemHandler with the specified output directory.

@@ -16,11 +19,12 @@ class FilesystemHandler(BaseHandler):
             output_dir (str): The directory where files will be written.
             **kwargs: Additional keyword arguments for the BaseHandler.
         """
-        super().__init__(**kwargs)
-        self.output_dir = Path(output_dir)
+        obj = await super().create(**kwargs)
+        obj.output_dir = Path(output_dir)
         # Ensure the target directory exists
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-        self.logger.info(f"Output directory set to {self.output_dir}")
+        obj.output_dir.mkdir(parents=True, exist_ok=True)
+        obj.logger.info(f"Output directory set to {obj.output_dir}")
+        return obj


     async def _write_entry(self, entry: dict, uid: str) -> bool:
src/output_handlers/s3.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+"""Handler that writes asynchronously."""
+from .base_handler import BaseHandler
+import json
+from aiobotocore.session import AioSession
+from aiobotocore.client import AioBaseClient
+from contextlib import AsyncExitStack
+
+class S3Handler(BaseHandler):
+    """
+    Handler that writes files to an S3 bucket asynchronously.
+    """
+    bucket_name: str
+    client: AioBaseClient
+    exit_stack: AsyncExitStack
+
+    @classmethod
+    async def create(cls, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs) -> "S3Handler":
+        """
+        Initializes the Handler with the specified S3 endpoint and bucket name.
+
+        Args:
+            **kwargs: Additional keyword arguments for the BaseHandler.
+        """
+        obj = await super().create(**kwargs)
+        obj.bucket_name = bucket_name
+
+        obj.exit_stack = AsyncExitStack()
+
+        session = AioSession()
+        obj.client = await obj.exit_stack.enter_async_context(
+            session.create_client(
+                service_name = 's3',
+                # region_name='us-west-2',
+                aws_secret_access_key = secret_key,
+                aws_access_key_id = access_key,
+                endpoint_url = url,
+            )
+        )
+
+        await obj._ensure_bucket_exists()
+        return obj
+
+
+    async def _ensure_bucket_exists(self):
+        """
+        Ensures that the specified S3 bucket exists, but does not create it if it doesn't.
+        """
+        # this will raise an error if the bucket does not exist
+        await self.client.head_bucket(Bucket=self.bucket_name)
+
+
+    async def _write_entry(self, entry: dict, uid: str) -> bool:
+        """
+        Asynchronously writes a single entry to the bucket.
+        """
+        data = json.dumps(entry).encode('utf-8')
+        try:
+            response = await self.client.put_object(
+                Bucket = self.bucket_name,
+                Key = f"{uid}.json",
+                Body = data
+            )
+
+            if response['ResponseMetadata']['HTTPStatusCode'] not in (200, 201):
+                raise Exception(f"Response: {response}")
+            return True
+
+        except:
+            self.logger.exception(f"Failed to write entry {uid} to bucket {self.bucket_name}.")
+            return False
+
+
+    async def close(self):
+        await self.client.close()
+        await self.exit_stack.__aexit__(None, None, None)
+        await super().close()
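Note: aiobotocore's create_client() yields an async context manager, which is why the handler parks it on an AsyncExitStack and unwinds the stack in close(). A standalone sketch of the same client lifecycle; the endpoint, credentials, and bucket name are placeholders, not values from this project:

import asyncio
from contextlib import AsyncExitStack
from aiobotocore.session import AioSession

async def upload_json(data: bytes) -> None:
    stack = AsyncExitStack()
    client = await stack.enter_async_context(
        AioSession().create_client(
            "s3",
            endpoint_url="https://s3.example.org",   # placeholder endpoint
            aws_access_key_id="ACCESS_KEY",          # placeholder credentials
            aws_secret_access_key="SECRET_KEY",
        )
    )
    try:
        await client.head_bucket(Bucket="my-bucket")  # raises if the bucket is missing
        await client.put_object(Bucket="my-bucket", Key="example.json", Body=data)
    finally:
        await stack.aclose()  # exits the client context manager

# asyncio.run(upload_json(b'{"hello": "world"}'))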
@@ -97,5 +97,6 @@ class WikiDumpHandler(xml.sax.ContentHandler):

         # Write to all handlers concurrently
         await asyncio.gather(
-            *[handler.write_entry(entry, uid) for handler in self.handlers]
+            *[handler.write_entry(entry, uid) for handler in self.handlers],
+            return_exceptions = True,
         )
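Note: with return_exceptions=True, one failing handler no longer cancels the sibling writes; gather() waits for all of them and hands back the exception objects as results. A minimal illustration, separate from the handler classes above:

import asyncio

async def ok() -> str:
    return "written"

async def boom() -> str:
    raise RuntimeError("upload failed")

async def main():
    results = await asyncio.gather(ok(), boom(), return_exceptions=True)
    # the RuntimeError appears in results instead of propagating
    for r in results:
        if isinstance(r, Exception):
            print("failed:", r)

asyncio.run(main())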