Mirror of https://github.com/bcye/structured-wikivoyage-exports.git
Synced 2025-06-09 17:34:06 +00:00

Compare commits: docker/0.4...main (37 commits)
8f099dc7bc
be28fddeb5
5031f33ea2
60c13fb9ec
f73046bd65
5d1ec5bb2e
38901474c6
b33201e930
93d99bf062
1a5b9b44e0
3126d2c39b
84e9a68bac
f67d8d3963
c780a4bb99
1e89b20483
1c16ee87e6
fba9be556e
729d4adc62
0c2905c119
08cd8b41fe
63babeace3
b18387a83c
3e2149ebcc
243c4be9fe
6faf2a1a97
82520947e0
ac2ab450b9
4bed99ca8c
96843f104c
75503c971d
559bcdda44
322df10561
59b2aeb1f4
d48e75ce01
5e74672049
4d25bc9e4c
201387be5e

.github/workflows/publish-types.yaml (vendored, new file, 40 lines)
@@ -0,0 +1,40 @@
# Example from https://docs.github.com/en/actions/use-cases-and-examples/publishing-packages/publishing-nodejs-packages#publishing-packages-to-the-npm-registry

name: Publish Types Package to npmjs

on:
  push:
    tags:
      - "types/*"

defaults:
  run:
    working-directory: types

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        name: Install pnpm
        with:
          version: 10
          run_install: false

      # Setup .npmrc file to publish to npm
      - uses: actions/setup-node@v4
        with:
          node-version: "20.x"
          cache: "pnpm"
          cache-dependency-path: "types/pnpm-lock.yaml"
          registry-url: "https://registry.npmjs.org"

      - run: pnpm install --frozen-lockfile
      - run: pnpm tsc
      - run: pnpm publish --provenance --access public --no-git-checks
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

.github/workflows/test-parser.yaml (vendored, new file, 23 lines)
@@ -0,0 +1,23 @@
on:
  pull_request:

jobs:
  run-tests:
    name: Unit-Test Parser
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v5

      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: ".python-version"

      - name: Install the project
        run: uv sync --locked --dev

      - name: Run tests
        run: PYTHONPATH=. uv run pytest

@@ -7,4 +7,4 @@ RUN uv sync --frozen
 
 COPY . .
 
-CMD ["uv", "run", "transform-documents.py"]
+CMD ["uv", "run", "main.py"]

README.md (16 lines changed)
@@ -15,3 +15,19 @@ TypeScript types for consuming the json output are available, you may install th
 ## Documentation
 
 See [docs](docs) for more information on how to use this utility.
+
+## Testing
+
+Run `PYTHONPATH=. pytest` from inside the venv.
+
+## License
+
+### Code
+
+(c) 2025 bcye and moll-re
+
+All code and documentation, unless otherwise stated, is licensed under the AGPLv3 license; refer to [LICENSE](LICENSE) for the full license text. The types package and all its code is [licensed under MIT](types/LICENSE).
+
+### Examples
+
+Files in the `docs/example` and `tests/fixtures` directories are copies (.txt) or derivatives (.json) of the Boston article on Wikivoyage and licensed under CC BY-SA 4.0. A [list of contributors is available on the original article](https://en.wikivoyage.org/w/index.php?title=Boston&action=history).

docs/types.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Types Package

## Publishing new versions

Bump the version in package.json, create a new commit, and tag it with "types/x.y.z"; the version will be published when the tag is pushed to GitHub.

main.py (new file, 143 lines)
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser


logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handlers
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handlers)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler(s) to load?
    handler_names = os.getenv("HANDLER", "").split(",")
    if not handler_names or not handler_names[0]:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem' or 'filesystem,sftp')")
        sys.exit(1)

    # 2. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    handlers = []

    # 3. Load each handler
    for handler_name in handler_names:
        handler_name = handler_name.strip()
        if not handler_name:
            continue

        # Dynamic import
        module_path = f"output_handlers.{handler_name}"
        try:
            mod = importlib.import_module(module_path)
        except ImportError as e:
            logger.error(f"Error loading handler module {module_path}: {e}")
            sys.exit(1)

        # Find the class: e.g. "sftp" → "SftpHandler"
        class_name = handler_name.title().replace("_", "") + "Handler"
        if not hasattr(mod, class_name):
            logger.error(f"{module_path} defines no class {class_name}")
            sys.exit(1)
        HandlerCls = getattr(mod, class_name)

        logger.info(f"Using handler from {module_path}")

        # Build kwargs from ENV
        handler_kwargs = gather_handler_kwargs(handler_name)

        # Add max_concurrent to kwargs
        handler_kwargs["max_concurrent"] = max_conc

        # Instantiate
        handler = HandlerCls(**handler_kwargs)
        handlers.append(handler)

    # 4. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 5. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handlers)  # concurrency is enforced inside each handler

    # 6. Finish up
    await asyncio.gather(*[handler.close() for handler in handlers])
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())
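
As a quick illustration of the `HANDLER_<NAME>_*` convention that main.py relies on, here is a minimal sketch, assuming the `gather_handler_kwargs` function above; the SFTP variable names are hypothetical examples, not part of this diff:

```python
# Sketch only: shows how gather_handler_kwargs() (defined above) turns
# environment variables into constructor kwargs. The SFTP names are made up.
import os

os.environ["HANDLER_SFTP_HOST"] = "dumps.example.org"  # plain string stays a string
os.environ["HANDLER_SFTP_PORT"] = "2222"               # all-digit values are cast to int
os.environ["HANDLER_SFTP_VERIFY"] = "true"             # 'true'/'false' are cast to bool

print(gather_handler_kwargs("sftp"))
# -> {'host': 'dumps.example.org', 'port': 2222, 'verify': True}
```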

@@ -1,6 +1,7 @@
 """Reference handler for output handlers."""
 from abc import ABC, abstractmethod
 import logging
+import asyncio
 
 
@@ -14,15 +15,20 @@ class BaseHandler(ABC):
     _successful_writes = 0
     _failed_writes = 0
 
-    def __init__(self, fail_on_error: bool = True, **kwargs):
+    def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
         """
         Initializes the BaseHandler with optional parameters.
 
         Args:
             fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
+            max_concurrent: Maximum number of concurrent write operations.
+                0 means unlimited concurrency.
             **kwargs: Additional keyword arguments for specific handler implementations.
         """
         self.fail_on_error = fail_on_error
+        self.semaphore = None
+        if max_concurrent > 0:
+            self.semaphore = asyncio.Semaphore(max_concurrent)
 
 
     @abstractmethod
@@ -47,6 +53,10 @@ class BaseHandler(ABC):
             entry (dict): The entry to write (will be JSON-encoded).
             uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
         """
-        success = await self._write_entry(entry, uid)
+        if self.semaphore:
+            async with self.semaphore:
+                success = await self._write_entry(entry, uid)
+        else:
+            success = await self._write_entry(entry, uid)
         if success:
             self.logger.debug(f"Successfully wrote entry with UID {uid}")
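
To make the new `max_concurrent` behaviour concrete, here is a minimal sketch of a throttled subclass; `EchoHandler` is hypothetical and only `_write_entry` is overridden (depending on BaseHandler's other abstract methods, such as `close()`, more overrides may be required):

```python
# Minimal sketch, assuming the BaseHandler shown in the diff above.
import asyncio

class EchoHandler(BaseHandler):  # hypothetical subclass for illustration
    async def _write_entry(self, entry: dict, uid: str) -> bool:
        await asyncio.sleep(0.1)  # stand-in for a slow network write
        return True

async def demo():
    handler = EchoHandler(max_concurrent=2)  # at most 2 writes in flight
    # Ten writes are scheduled at once, but the semaphore inside
    # write_entry() lets only two _write_entry() calls run concurrently.
    await asyncio.gather(*[
        handler.write_entry({"n": i}, uid=str(i)) for i in range(10)
    ])

asyncio.run(demo())
```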

@@ -10,8 +10,9 @@ class BunnyStorageHandler(BaseHandler):
         api_key: str,
         fail_on_error: bool = True,
         keepalive_timeout: int = 75,
+        **kwargs,
     ):
-        super().__init__(fail_on_error=fail_on_error)
+        super().__init__(fail_on_error=fail_on_error, **kwargs)
         self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
        self.headers = {
             "AccessKey": api_key,

@@ -12,3 +12,8 @@ dependencies = [
     "python-dotenv>=1.1.0",
     "wikitextparser>=0.56.3",
 ]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.3.5",
+]

tests/fixtures/boston_input.txt (vendored, new file, 1577 lines)
File diff suppressed because it is too large

tests/fixtures/boston_output.json (vendored, new file, 2905 lines)
File diff suppressed because one or more lines are too long

tests/test_parser_json_snippets.py (new file, 333 lines)
@@ -0,0 +1,333 @@
import json
import os
import pytest
from transformers import WikivoyageParser


def dump(obj):
    # canonical JSON for deep compare
    return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def wrap(children):
    """Wrap a list of child nodes in the default root envelope."""
    return {
        "type": "root",
        "properties": {},
        "children": children
    }


@pytest.fixture
def parser():
    return WikivoyageParser()


def test_empty_input_is_root_only(parser):
    got = parser.parse("")
    assert dump(got) == dump(wrap([]))


def test_plain_text_node(parser):
    got = parser.parse("Just some plain text.")
    expected = wrap([
        {"type":"text","properties":{"markdown":"Just some plain text."},"children":[]}
    ])
    assert dump(got) == dump(expected)


def test_template_node(parser):
    got = parser.parse("{{foo|a=1|b=two}}")
    expected = wrap([
        {
            "type":"template",
            "properties":{"name":"foo","params":{"a":"1","b":"two"}},
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_see_listing_full_properties(parser):
    snippet = (
        "{{see"
        "|name=Statue"
        "|alt=Monument"
        "|url=http://x"
        "|email=a@b.com"
        "|address=1 Road"
        "|lat=1.23"
        "|long=4.56"
        "|directions=North"
        "|phone=12345"
        "|tollfree=800"
        "|fax=54321"
        "|hours=24/7"
        "|price=Free"
        "|lastedit=2020-01-01"
        "|wikipedia=Statue"
        "|wikidata=Q1"
        "|content=Big statue"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"see",
            "properties":{
                "name":"Statue","alt":"Monument","url":"http://x",
                "email":"a@b.com","address":"1 Road","lat":"1.23","long":"4.56",
                "directions":"North","phone":"12345","tollfree":"800",
                "fax":"54321","hours":"24/7","price":"Free",
                "lastedit":"2020-01-01","wikipedia":"Statue","wikidata":"Q1",
                "content":"Big statue"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_do_listing_full_properties(parser):
    snippet = (
        "{{do"
        "|name=Walk"
        "|alt=Stroll"
        "|url=http://walk"
        "|email=hi@walk"
        "|address=Main Street"
        "|lat=2.34"
        "|long=5.67"
        "|directions=East"
        "|phone=222-333"
        "|tollfree=800-DO-WALK"
        "|fax=999-888"
        "|hours=All day"
        "|price=Free"
        "|lastedit=2021-02-02"
        "|wikipedia=Walking"
        "|wikidata=Q2"
        "|content=Enjoy a walk"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"do",
            "properties":{
                "name":"Walk","alt":"Stroll","url":"http://walk",
                "email":"hi@walk","address":"Main Street","lat":"2.34","long":"5.67",
                "directions":"East","phone":"222-333","tollfree":"800-DO-WALK",
                "fax":"999-888","hours":"All day","price":"Free",
                "lastedit":"2021-02-02","wikipedia":"Walking","wikidata":"Q2",
                "content":"Enjoy a walk"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_buy_listing_full_properties(parser):
    snippet = (
        "{{buy"
        "|name=Shirt"
        "|alt=Tees"
        "|url=http://shop"
        "|email=sales@shop"
        "|address=Market St"
        "|lat=3.45"
        "|long=6.78"
        "|directions=West"
        "|phone=444-555"
        "|tollfree=800-BUY-TEE"
        "|fax=777-666"
        "|hours=9–6"
        "|price=$20"
        "|lastedit=2022-03-03"
        "|wikipedia=Shopping"
        "|wikidata=Q3"
        "|content=Quality tees"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"buy",
            "properties":{
                "name":"Shirt","alt":"Tees","url":"http://shop",
                "email":"sales@shop","address":"Market St","lat":"3.45","long":"6.78",
                "directions":"West","phone":"444-555","tollfree":"800-BUY-TEE",
                "fax":"777-666","hours":"9–6","price":"$20",
                "lastedit":"2022-03-03","wikipedia":"Shopping","wikidata":"Q3",
                "content":"Quality tees"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_eat_listing_full_properties(parser):
    snippet = (
        "{{eat"
        "|name=Diner"
        "|alt=Cafe"
        "|url=http://eat"
        "|email=food@eat"
        "|address=Food Lane"
        "|lat=4.56"
        "|long=7.89"
        "|directions=South"
        "|phone=666-777"
        "|tollfree=800-EAT-YUM"
        "|fax=555-444"
        "|hours=Breakfast"
        "|price=$10–$30"
        "|lastedit=2023-04-04"
        "|wikipedia=Dining"
        "|wikidata=Q4"
        "|content=Best pancakes"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"eat",
            "properties":{
                "name":"Diner","alt":"Cafe","url":"http://eat",
                "email":"food@eat","address":"Food Lane","lat":"4.56","long":"7.89",
                "directions":"South","phone":"666-777","tollfree":"800-EAT-YUM",
                "fax":"555-444","hours":"Breakfast","price":"$10–$30",
                "lastedit":"2023-04-04","wikipedia":"Dining","wikidata":"Q4",
                "content":"Best pancakes"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_drink_listing_full_properties(parser):
    snippet = (
        "{{drink"
        "|name=Pub"
        "|alt=Bar"
        "|url=http://drink"
        "|email=cheers@drink"
        "|address=Bar Street"
        "|lat=5.67"
        "|long=8.90"
        "|directions=Center"
        "|phone=888-999"
        "|tollfree=800-DRINK"
        "|fax=333-222"
        "|hours=Evening"
        "|price=$7–$30"
        "|lastedit=2024-05-05"
        "|wikipedia=Nightlife"
        "|wikidata=Q5"
        "|content=Great brews"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"drink",
            "properties":{
                "name":"Pub","alt":"Bar","url":"http://drink",
                "email":"cheers@drink","address":"Bar Street","lat":"5.67","long":"8.90",
                "directions":"Center","phone":"888-999","tollfree":"800-DRINK",
                "fax":"333-222","hours":"Evening","price":"$7–$30",
                "lastedit":"2024-05-05","wikipedia":"Nightlife","wikidata":"Q5",
                "content":"Great brews"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_sleep_listing_full_properties(parser):
    snippet = (
        "{{sleep"
        "|name=Hotel"
        "|alt=Inn"
        "|url=http://sleep"
        "|email=stay@sleep"
        "|address=Sleepy Ave"
        "|lat=6.78"
        "|long=9.01"
        "|directions=Uptown"
        "|phone=000-111"
        "|tollfree=800-SLEEP"
        "|fax=111-000"
        "|hours=24h"
        "|price=$100"
        "|lastedit=2025-06-06"
        "|wikipedia=Accommodation"
        "|wikidata=Q6"
        "|checkin=3PM"
        "|checkout=11AM"
        "|content=Cozy rooms"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"sleep",
            "properties":{
                "name":"Hotel","alt":"Inn","url":"http://sleep",
                "email":"stay@sleep","address":"Sleepy Ave","lat":"6.78","long":"9.01",
                "directions":"Uptown","phone":"000-111","tollfree":"800-SLEEP",
                "fax":"111-000","hours":"24h","price":"$100",
                "lastedit":"2025-06-06","wikipedia":"Accommodation","wikidata":"Q6",
                "checkin":"3PM","checkout":"11AM","content":"Cozy rooms"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_generic_listing_full_properties(parser):
    snippet = (
        "{{listing"
        "|name=Info"
        "|alt=Data"
        "|url=http://info"
        "|email=info@info"
        "|address=Down St"
        "|lat=7.89"
        "|long=0.12"
        "|directions=Here"
        "|phone=123-000"
        "|tollfree=800-INFO"
        "|fax=000-123"
        "|hours=All times"
        "|price=$0"
        "|lastedit=2026-07-07"
        "|wikipedia=InfoPage"
        "|wikidata=Q7"
        "|content=Useful info"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"listing",
            "properties":{
                "name":"Info","alt":"Data","url":"http://info",
                "email":"info@info","address":"Down St","lat":"7.89","long":"0.12",
                "directions":"Here","phone":"123-000","tollfree":"800-INFO",
                "fax":"000-123","hours":"All times","price":"$0",
                "lastedit":"2026-07-07","wikipedia":"InfoPage","wikidata":"Q7",
                "content":"Useful info"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_section_and_subsection(parser):
    got = parser.parse("Intro\n== First ==\nHello\n=== Sub ===\nWorld")
    sec = got["children"][1]
    assert sec["type"] == "section" and sec["properties"]["level"] == 2
    sub = sec["children"][1]
    assert sub["type"] == "section" and sub["properties"]["level"] == 3


def test_full_boston_snapshot(parser):
    here = os.path.dirname(__file__)
    inp = os.path.join(here, "fixtures", "boston_input.txt")
    out = os.path.join(here, "fixtures", "boston_output.json")
    wikicode = open(inp, encoding="utf-8").read()
    expected = json.load(open(out, encoding="utf-8"))
    got = parser.parse(wikicode)
    assert dump(got) == dump(expected)

@@ -1,246 +0,0 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from parser import WikivoyageParser

logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = True
        elif name == "text" and self.inRevision:
            self.inText = True

    def endElement(self, name):
        if name == "page":
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = False
        elif name == "text":
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 5. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())

transformers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser

transformers/fetch_mappings.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re
import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings
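
For reference, a small sketch of what the tuple regex above pulls out of a `page_props` INSERT statement; the input line and page id are fabricated for illustration:

```python
# Sketch: the same tuple regex as above, run over a made-up INSERT
# statement to show which capture groups end up in the mappings dict.
import re

tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
sample = (
    "INSERT INTO page_props VALUES "
    "(94,'wikibase_item','Q100',NULL),"
    "(94,'page_image_free','Boston.jpg',NULL);"
)

mappings = {
    m.group(1): m.group(3)           # page id -> Wikidata item
    for m in tuple_re.finditer(sample)
    if m.group(2) == "wikibase_item"  # only keep wikibase_item rows
}
print(mappings)  # {'94': 'Q100'}
```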

transformers/wiki_dump_handler.py (new file, 100 lines)
@@ -0,0 +1,100 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler(s).
    """

    def __init__(self, mappings, handlers):
        super().__init__()
        self.mappings = mappings
        # Support a single handler or a list of handlers
        self.handlers = handlers
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentTitle: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                title = self.currentTitle
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                task = asyncio.create_task(self._process(text, wd_id, title))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.currentTag == "title" and self.inPage:
            if self.currentTitle is None:
                self.currentTitle = content
            else:
                self.currentTitle += content
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str, title: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        entry['properties']['title'] = title

        # Write to all handlers concurrently
        await asyncio.gather(*[
            handler.write_entry(entry, uid) for handler in self.handlers
        ])
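
As a sketch of how this handler is driven (the real driver is `process_dump` in main.py), a tiny in-memory dump can be fed through it; `CollectHandler`, the page id, and the Q-id are made up, and the snippet assumes `WikiDumpHandler` above is importable together with the project's `WikivoyageParser`:

```python
# Sketch: wiring WikiDumpHandler to a stock SAX parser with an
# in-memory document. CollectHandler and the ids are hypothetical.
import asyncio
import xml.sax

class CollectHandler:
    def __init__(self):
        self.entries: dict = {}
    async def write_entry(self, entry, uid):
        self.entries[uid] = entry

async def demo():
    doc = (
        "<mediawiki><page><title>Boston</title><id>94</id>"
        "<revision><text>== Understand ==</text></revision></page></mediawiki>"
    )
    out = CollectHandler()
    dump_handler = WikiDumpHandler({"94": "Q100"}, [out])
    sax_parser = xml.sax.make_parser()
    sax_parser.setContentHandler(dump_handler)
    sax_parser.feed(doc)   # create_task inside endElement needs demo()'s running loop
    sax_parser.close()
    await asyncio.gather(*dump_handler.tasks)
    print(out.entries["Q100"]["properties"]["title"])  # -> Boston

asyncio.run(demo())
```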

types/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 bcye and moll-re

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

types/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
# @bcye/structured-wikivoyage-types

Types to use when consuming JSON trees from the structured-wikivoyage-exports project.

types/package.json (new file, 29 lines)
@@ -0,0 +1,29 @@
{
  "name": "@bcye/structured-wikivoyage-types",
  "version": "0.2.5",
  "description": "Types to use when consuming json trees from the structured-wikivoyage-exports project",
  "keywords": [],
  "contributors": [
    "bcye",
    "moll-re"
  ],
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/bcye/structured-wikivoyage-exports.git"
  },
  "bugs": {
    "url": "https://github.com/bcye/structured-wikivoyage-exports/issues"
  },
  "homepage": "https://github.com/bcye/structured-wikivoyage-exports#readme",
  "files": [
    "dist/index.d.ts",
    "dist/index.js"
  ],
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "private": false,
  "devDependencies": {
    "typescript": "^5.8.3"
  }
}

types/pnpm-lock.yaml (generated, new file, 24 lines)
@@ -0,0 +1,24 @@
lockfileVersion: '9.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

importers:

  .:
    devDependencies:
      typescript:
        specifier: ^5.8.3
        version: 5.8.3

packages:

  typescript@5.8.3:
    resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==}
    engines: {node: '>=14.17'}
    hasBin: true

snapshots:

  typescript@5.8.3: {}

types/tsconfig.json (new file, 113 lines)
@@ -0,0 +1,113 @@
{
  "compilerOptions": {
    /* Visit https://aka.ms/tsconfig to read more about this file */

    /* Projects */
    // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
    // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
    // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
    // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
    // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */

    /* Language and Environment */
    "target": "es2016" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
    // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
    // "jsx": "preserve", /* Specify what JSX code is generated. */
    // "libReplacement": true, /* Enable lib replacement. */
    // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
    // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
    // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
    // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
    // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
    // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
    // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

    /* Modules */
    "module": "commonjs" /* Specify what module code is generated. */,
    // "rootDir": "./", /* Specify the root folder within your source files. */
    // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
    // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
    // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
    // "types": [], /* Specify type package names to be included without being referenced in a source file. */
    // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
    // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
    // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
    // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
    // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
    // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
    // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
    // "noUncheckedSideEffectImports": true, /* Check side effect imports. */
    // "resolveJsonModule": true, /* Enable importing .json files. */
    // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

    /* JavaScript Support */
    // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
    // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

    /* Emit */
    "declaration": true /* Generate .d.ts files from TypeScript and JavaScript files in your project. */,
    // "declarationMap": true, /* Create sourcemaps for d.ts files. */
    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
    // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
    // "noEmit": true, /* Disable emitting files from a compilation. */
    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    "outDir": "./dist/" /* Specify an output folder for all emitted files. */,
    // "removeComments": true, /* Disable emitting comments. */
    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
    // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
    // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
    // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
    // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
    // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
    // "newLine": "crlf", /* Set the newline character for emitting files. */
    // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
    // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
    // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
    // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
    // "declarationDir": "./", /* Specify the output directory for generated declaration files. */

    /* Interop Constraints */
    // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
    // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
    // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
    // "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. */
    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
    "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */,
    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,

    /* Type Checking */
    "strict": true /* Enable all strict type-checking options. */,
    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
    // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
    // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
    // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
    // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
    // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
    // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
    // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
    // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
    // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
    // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
    // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
    // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
    // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
    // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
    // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
    // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */

    /* Completeness */
    // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
    "skipLibCheck": true /* Skip type checking all .d.ts files. */
  }
}

uv.lock (generated, 59 lines changed)
@@ -135,6 +135,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 },
 ]
 
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
+]
+
 [[package]]
 name = "cryptography"
 version = "44.0.2"
@@ -239,6 +248,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
+]
+
 [[package]]
 name = "mapvoyage-extract"
 version = "0.1.0"
@@ -252,6 +270,11 @@ dependencies = [
     { name = "wikitextparser" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+]
+
 [package.metadata]
 requires-dist = [
     { name = "aiofiles", specifier = ">=24.1.0" },
@@ -262,6 +285,9 @@ requires-dist = [
     { name = "wikitextparser", specifier = ">=0.56.3" },
 ]
 
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=8.3.5" }]
+
 [[package]]
 name = "multidict"
 version = "6.4.3"
@@ -335,6 +361,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
 ]
 
+[[package]]
+name = "packaging"
+version = "25.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
+]
+
 [[package]]
 name = "propcache"
 version = "0.3.1"
@@ -401,6 +445,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
 ]
 
+[[package]]
+name = "pytest"
+version = "8.3.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
+]
+
 [[package]]
 name = "python-dotenv"
 version = "1.1.0"