Mirror of https://github.com/bcye/structured-wikivoyage-exports.git, synced 2025-11-01 15:42:43 +00:00
move code to dedicated src/ folder
src/main.py (new file, 132 lines)
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser


logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed it to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 9. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())
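For orientation, a minimal sketch of how gather_handler_kwargs() turns environment variables into constructor arguments. The variable values below are hypothetical examples, not defaults shipped with the project, and the import assumes the script is run from the src/ directory:

import os
from main import gather_handler_kwargs

# Hypothetical environment, e.g. loaded from a .env file by load_dotenv()
os.environ["HANDLER_FILESYSTEM_OUTPUT_DIR"] = "./output"
os.environ["HANDLER_FILESYSTEM_FAIL_ON_ERROR"] = "false"

kwargs = gather_handler_kwargs("filesystem")
# -> {"output_dir": "./output", "fail_on_error": False}
# main() would then instantiate FilesystemHandler(**kwargs).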
src/output_handlers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .base_handler import BaseHandler
from .filesystem import FilesystemHandler
from .bunny_storage import BunnyStorageHandler
src/output_handlers/base_handler.py (new file, 65 lines)
@@ -0,0 +1,65 @@
"""Reference handler for output handlers."""
from abc import ABC, abstractmethod
import logging


class BaseHandler(ABC):
    """
    Abstract base class for output handlers. Defines the standardized interface that all output handlers must implement.
    In particular, it requires the implementation of an asynchronous ("private") method `_write_entry` to write a single entry to the output.
    """

    logger = logging.getLogger(__name__)
    _successful_writes = 0
    _failed_writes = 0

    def __init__(self, fail_on_error: bool = True, **kwargs):
        """
        Initializes the BaseHandler with optional parameters.

        Args:
            fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
            **kwargs: Additional keyword arguments for specific handler implementations.
        """
        self.fail_on_error = fail_on_error

    @abstractmethod
    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the output. This method should gracefully handle any exceptions that may occur during the writing process and simply return False if an error occurs.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        pass

    async def write_entry(self, entry: dict, uid: str):
        """
        Public method to write an entry to the output. It handles exceptions and logs errors.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
        """
        success = await self._write_entry(entry, uid)
        if success:
            self.logger.debug(f"Successfully wrote entry with UID {uid}")
            self._successful_writes += 1
        else:
            self.logger.error(f"Failed to write entry with UID {uid}")
            self._failed_writes += 1
            if self.fail_on_error:
                raise Exception(f"Failed to write entry with UID {uid}")

    async def close(self):
        """
        Closes the handler. This method should be overridden by subclasses if they need to perform any cleanup operations.
        """
        self.logger.info(f"Wrote {self._successful_writes + self._failed_writes} entries: {self._successful_writes} successful, {self._failed_writes} failed.")
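To illustrate the contract, a hypothetical minimal subclass (not part of this commit) that writes each entry to stdout instead of persisting it:

import json
from output_handlers.base_handler import BaseHandler


class StdoutHandler(BaseHandler):
    """Toy handler: prints each entry rather than storing it."""

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        try:
            print(uid, json.dumps(entry))
            return True
        except Exception:
            # Per the base-class contract, swallow the error and report failure
            return False

With HANDLER=stdout, main() would look for a module output_handlers.stdout exposing this class.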
src/output_handlers/bunny_storage.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import json
import aiohttp
from .base_handler import BaseHandler


class BunnyStorageHandler(BaseHandler):
    def __init__(
        self,
        region: str,
        base_path: str,
        api_key: str,
        fail_on_error: bool = True,
        keepalive_timeout: int = 75,
    ):
        super().__init__(fail_on_error=fail_on_error)
        self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
        self.headers = {
            "AccessKey": api_key,
            "Content-Type": "application/json",
            "accept": "application/json",
        }

        # initialized later, in a guaranteed async context
        self._connector = None
        self._session = None
        self._keepalive_timeout = keepalive_timeout

    async def setup_connector(self):
        if self._session is None:
            self._connector = aiohttp.TCPConnector(
                # limit is implicitly set to 100
                keepalive_timeout=self._keepalive_timeout,
            )
            self._session = aiohttp.ClientSession(connector=self._connector)

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        await self.setup_connector()
        payload = json.dumps(entry).encode("utf-8")
        url = f"{self.base_url}/{uid}.json"

        try:
            async with self._session.put(url, data=payload, headers=self.headers) as resp:
                if resp.status in (200, 201, 204):
                    return True
                body = await resp.text()
                self.logger.error(f"Upload failed UID={uid} status={resp.status} body={body}")
                return False

        except Exception:
            self.logger.exception(f"Exception while uploading UID={uid}")
            return False

    async def close(self):
        # The session and connector only exist if at least one entry was written.
        if self._session is not None:
            await self._session.close()
            await self._connector.close()
        await super().close()
src/output_handlers/filesystem.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""Handler that writes files to the filesystem."""
from pathlib import Path
import aiofiles
from .base_handler import BaseHandler
import json


class FilesystemHandler(BaseHandler):
    """
    Handler that writes files to the filesystem.
    """
    def __init__(self, output_dir: str, **kwargs):
        """
        Initializes the FilesystemHandler with the specified output directory.

        Args:
            output_dir (str): The directory where files will be written.
            **kwargs: Additional keyword arguments for the BaseHandler.
        """
        super().__init__(**kwargs)
        self.output_dir = Path(output_dir)
        # Ensure the target directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Output directory set to {self.output_dir}")

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the filesystem.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        try:
            file_path = self.output_dir / f"{uid}.json"
            async with aiofiles.open(file_path, 'w') as f:
                await f.write(json.dumps(entry))
            return True
        except IOError as e:
            self.logger.error(f"Error writing entry {uid}: {e}")
            return False
src/transformers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser
src/transformers/fetch_mappings.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re
import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings
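As an illustration of what the tuple regex extracts, here it is applied to a made-up fragment of an INSERT statement (the page id and Q-id are invented, not taken from the real dump):

sample = "(1045,'wikibase_item','Q1781',NULL),(1045,'page_image','Foo.jpg',NULL)"
for m in tuple_re.finditer(sample):
    page_id, prop, value = m.group(1), m.group(2), m.group(3)
    if prop == "wikibase_item":
        print(page_id, "->", value)   # prints: 1045 -> Q1781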
src/transformers/parser.py (new file, 348 lines)
@@ -0,0 +1,348 @@
"""Where the magic happens: parsing wikitext into a structured JSON tree."""
import mwparserfromhell as mwp
import mwparserfromhell.nodes as nodes
import json
from typing import Dict


DOCUMENT_TEMPLATES = [
    "pagebanner", "mapframe", "routebox", "geo", "isPartOf",
    "usablecity", "guidecity", "outlinecity"
]
LISTING_TEMPLATES = [
    "see", "do", "buy", "eat", "drink", "sleep", "listing"
]


class WikivoyageParser:
    """
    A parser for Wikivoyage wikitext to JSON tree structure.
    This class uses mwparserfromhell to parse the wikitext and convert it into a structured JSON format.
    """
    def __init__(self):
        self.root = {
            "type": "root",
            "properties": {},
            "children": []
        }
        self.current_section = self.root

    def parse(self, wikitext: str) -> Dict:
        """Parse wikitext and return structured JSON tree"""
        self.root = {
            "type": "root",
            "properties": {},
            "children": []
        }
        self.current_section = self.root

        # Parse the wikitext
        parsed = mwp.parse(wikitext)

        # Process the parsed content
        self._process_nodes(parsed)

        return self.root

    def _process_nodes(self, wikicode):
        """Process all nodes in the wikicode"""
        current_text = ""

        for node in wikicode.nodes:
            # Handle different node types
            if isinstance(node, nodes.heading.Heading):
                # First flush any pending text
                if current_text:
                    self._add_text_node(current_text)
                    current_text = ""

                # Create new section
                self._handle_heading(node)

            elif isinstance(node, nodes.template.Template):
                # First flush any pending text
                if current_text:
                    self._add_text_node(current_text)
                    current_text = ""

                # Handle template
                self._handle_template(node)

            elif isinstance(node, nodes.text.Text):
                # Accumulate text
                current_text += str(node.value)

            elif isinstance(node, nodes.tag.Tag):
                # Handle tag (potential styling)
                tag_text = self._convert_tag_to_markdown(node)
                current_text += tag_text

            elif isinstance(node, nodes.wikilink.Wikilink):
                # Handle wikilink
                link_text = self._convert_wikilink_to_markdown(node)
                current_text += link_text

            elif isinstance(node, nodes.external_link.ExternalLink):
                # Handle external link
                link_text = self._convert_external_link_to_markdown(node)
                current_text += link_text

            elif isinstance(node, nodes.comment.Comment):
                # Skip comments
                pass

            else:
                # Process other nodes as text
                current_text += str(node)

        # Add any remaining text
        if current_text:
            self._add_text_node(current_text)

    def _add_text_node(self, text: str):
        """Add a text node to the current section"""
        # Avoid adding empty text nodes
        if not text.strip():
            return

        text_node = {
            "type": "text",
            "properties": {
                "markdown": text.strip()
            },
            "children": []
        }

        self.current_section["children"].append(text_node)

    def _handle_heading(self, heading_node):
        """Handle a heading node by creating a new section"""
        level = heading_node.level
        title = str(heading_node.title).strip()

        # Create new section node
        section = {
            "type": "section",
            "properties": {
                "title": title,
                "level": level
            },
            "children": []
        }

        # Find the appropriate parent section based on level
        parent = self.root

        # If the level is 1, the parent is the root
        if level > 1:
            # Walk the existing sections from the most recent one backwards
            for child in reversed(self._get_all_sections()):
                child_level = child["properties"]["level"]
                if child_level < level:
                    parent = child
                    break

        # Add the section to its parent
        parent["children"].append(section)

        # Update current section
        self.current_section = section

    def _get_all_sections(self):
        """Get all sections in the document in the order they appear"""
        sections = []

        def collect_sections(node):
            if node["type"] == "section":
                sections.append(node)
            for child in node["children"]:
                if child["type"] == "section":
                    collect_sections(child)

        collect_sections(self.root)
        return sections

    def _handle_template(self, template_node):
        """Handle a template node"""
        template_name = str(template_node.name).strip().lower()

        # Check if it's a document-wide template
        if template_name in DOCUMENT_TEMPLATES:
            self._handle_document_template(template_node)
            return

        # Check if it's a listing template
        if template_name in LISTING_TEMPLATES:
            self._handle_listing_template(template_node)
            return

        # Handle other templates as regular nodes
        self._handle_other_template(template_node)

    def _handle_document_template(self, template_node):
        """Handle document-wide templates by adding to root properties"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        params = {}
        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()
            params[name] = value

        # Add to root properties
        self.root["properties"][template_name] = params

    def _handle_listing_template(self, template_node):
        """Handle listing templates (see, do, buy, eat, drink, sleep)"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        properties = {}
        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()

            # Convert content to markdown if it's in the 'content' parameter
            if name == "content":
                value = self._convert_wikicode_to_markdown(param.value)

            properties[name] = value

        # Create listing node
        listing_node = {
            "type": template_name,
            "properties": properties,
            "children": []
        }

        # Add to current section
        self.current_section["children"].append(listing_node)

    def _handle_other_template(self, template_node):
        """Handle other templates as general template nodes"""
        template_name = str(template_node.name).strip().lower()

        # Extract parameters
        properties = {
            "name": template_name,
            "params": {}
        }

        for param in template_node.params:
            name = str(param.name).strip()
            value = str(param.value).strip()
            properties["params"][name] = value

        # Create template node
        template_entry = {
            "type": "template",
            "properties": properties,
            "children": []
        }

        # Add to current section
        self.current_section["children"].append(template_entry)

    def _convert_wikicode_to_markdown(self, wikicode) -> str:
        """Convert wikicode to markdown"""
        markdown = ""

        for node in wikicode.nodes:
            if isinstance(node, nodes.text.Text):
                markdown += str(node.value)

            elif isinstance(node, nodes.tag.Tag):
                markdown += self._convert_tag_to_markdown(node)

            elif isinstance(node, nodes.wikilink.Wikilink):
                markdown += self._convert_wikilink_to_markdown(node)

            elif isinstance(node, nodes.external_link.ExternalLink):
                markdown += self._convert_external_link_to_markdown(node)

            else:
                # For other nodes, just use their string representation
                markdown += str(node)

        return markdown.strip()

    def _convert_tag_to_markdown(self, tag_node) -> str:
        """Convert HTML tag to markdown"""
        tag = str(tag_node.tag).lower()
        content = str(tag_node.contents)

        # Convert the content recursively to handle nested tags
        if tag_node.contents:
            content = self._convert_wikicode_to_markdown(tag_node.contents)

        # Handle different tags
        if tag == 'b' or tag == 'strong':
            return f"**{content}**"
        elif tag == 'i' or tag == 'em':
            return f"*{content}*"
        elif tag == 'u':
            return f"_{content}_"
        elif tag == 'strike' or tag == 's' or tag == 'del':
            return f"~~{content}~~"
        elif tag == 'code':
            return f"`{content}`"
        elif tag == 'pre':
            return f"```\n{content}\n```"
        elif tag == 'br':
            return "\n"
        elif tag == 'hr':
            return "\n---\n"
        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag[1])
            return f"\n{'#' * level} {content}\n"
        elif tag == 'a':
            href = ""
            for attr in tag_node.attributes:
                if str(attr.name).lower() == 'href':
                    href = str(attr.value)
                    break
            return f"[{content}]({href})"
        elif tag == 'img':
            src = alt = ""
            for attr in tag_node.attributes:
                if str(attr.name).lower() == 'src':
                    src = str(attr.value)
                elif str(attr.name).lower() == 'alt':
                    alt = str(attr.value)
            return f"![{alt}]({src})"
        else:
            # For unknown tags, just return the content
            return content

    def _convert_wikilink_to_markdown(self, wikilink_node) -> str:
        """Convert wikilink to markdown"""
        title = str(wikilink_node.title)

        if wikilink_node.text:
            text = str(wikilink_node.text)
            return f"[{text}]({title})"
        else:
            return f"[{title}]({title})"

    def _convert_external_link_to_markdown(self, link_node) -> str:
        """Convert external link to markdown"""
        url = str(link_node.url)

        if link_node.title:
            title = str(link_node.title)
            return f"[{title}]({url})"
        else:
            return url

    def export_json(self, root=None, indent=2) -> str:
        """Export the tree as JSON string"""
        if root is None:
            root = self.root

        return json.dumps(root, indent=indent)
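A quick usage sketch of the parser (the wikitext snippet below is invented for illustration and assumes mwparserfromhell is installed):

parser = WikivoyageParser()
tree = parser.parse(
    "{{pagebanner|Example banner.jpg}}\n"
    "== See ==\n"
    "{{see|name=Old Town|content=The '''historic''' centre.}}\n"
)
print(parser.export_json(tree))
# The pagebanner template lands in tree["properties"], "See" becomes a
# section node, and the {{see}} listing becomes a child node of that section.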
src/transformers/wiki_dump_handler.py (new file, 96 lines)
@@ -0,0 +1,96 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
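For a rough end-to-end check of the SAX flow without downloading the full dump, a hypothetical smoke test could look like this (not part of the repository; it assumes it is run from the src/ directory and that /tmp/wikivoyage-test is writable):

import asyncio
import xml.sax
from transformers import WikiDumpHandler
from output_handlers import FilesystemHandler

SAMPLE = (
    "<mediawiki>"
    "<page><id>42</id><revision><text>== See ==\nSome text.</text></revision></page>"
    "</mediawiki>"
)

async def smoke_test():
    handler = FilesystemHandler(output_dir="/tmp/wikivoyage-test")
    # Map the fake page id 42 to a fake Wikidata id; max_concurrent=0 disables the semaphore
    dump_handler = WikiDumpHandler({"42": "Q42"}, handler, max_concurrent=0)
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(dump_handler)
    sax_parser.feed(SAMPLE)   # endElement("page") schedules the parse/write task
    sax_parser.close()
    await asyncio.gather(*dump_handler.tasks)
    await handler.close()     # logs write statistics

asyncio.run(smoke_test())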