import mwparserfromhell as mwp import mwparserfromhell.nodes as nodes import re import json from typing import Dict, List, Any, Optional, Union, Tuple import os from pathlib import Path import sys class WikivoyageParser: def __init__(self): self.document_templates = [ "pagebanner", "mapframe", "routebox", "geo", "isPartOf", "usablecity", "guidecity", "outlinecity" ] self.listing_templates = ["see", "do", "buy", "eat", "drink", "sleep", "listing"] self.root = { "type": "root", "properties": {}, "children": [] } self.current_section = self.root def parse(self, wikitext: str) -> Dict: """Parse wikitext and return structured JSON tree""" self.root = { "type": "root", "properties": {}, "children": [] } self.current_section = self.root # Parse the wikitext parsed = mwp.parse(wikitext) # Process the parsed content self._process_nodes(parsed) return self.root def _process_nodes(self, wikicode): """Process all nodes in the wikicode""" current_text = "" for node in wikicode.nodes: # Handle different node types if isinstance(node, nodes.heading.Heading): # First flush any pending text if current_text: self._add_text_node(current_text) current_text = "" # Create new section self._handle_heading(node) elif isinstance(node, nodes.template.Template): # First flush any pending text if current_text: self._add_text_node(current_text) current_text = "" # Handle template self._handle_template(node) elif isinstance(node, nodes.text.Text): # Accumulate text current_text += str(node.value) elif isinstance(node, nodes.tag.Tag): # Handle tag (potential styling) tag_text = self._convert_tag_to_markdown(node) current_text += tag_text elif isinstance(node, nodes.wikilink.Wikilink): # Handle wikilink link_text = self._convert_wikilink_to_markdown(node) current_text += link_text elif isinstance(node, nodes.external_link.ExternalLink): # Handle external link link_text = self._convert_external_link_to_markdown(node) current_text += link_text elif isinstance(node, nodes.comment.Comment): # Skip comments pass else: # Process other nodes as text current_text += str(node) # Add any remaining text if current_text: self._add_text_node(current_text) def _add_text_node(self, text: str): """Add a text node to the current section""" # Avoid adding empty text nodes if not text.strip(): return text_node = { "type": "text", "properties": { "markdown": text.strip() }, "children": [] } self.current_section["children"].append(text_node) def _handle_heading(self, heading_node): """Handle a heading node by creating a new section""" level = heading_node.level title = str(heading_node.title).strip() # Create new section node section = { "type": "section", "properties": { "title": title, "level": level }, "children": [] } # Find the appropriate parent section based on level parent = self.root # If the level is 1, the parent is the root if level > 1: # Start from root and traverse the tree current = self.root current_level = 0 for child in reversed(self._get_all_sections()): child_level = child["properties"]["level"] if child_level < level: parent = child break # Add the section to its parent parent["children"].append(section) # Update current section self.current_section = section def _get_all_sections(self): """Get all sections in the document in the order they appear""" sections = [] def collect_sections(node): if node["type"] == "section": sections.append(node) for child in node["children"]: if child["type"] == "section": collect_sections(child) collect_sections(self.root) return sections def _handle_template(self, template_node): """Handle a template node""" template_name = str(template_node.name).strip().lower() # Check if it's a document-wide template if template_name in self.document_templates: self._handle_document_template(template_node) return # Check if it's a listing template if template_name in self.listing_templates: self._handle_listing_template(template_node) return # Handle other templates as regular nodes self._handle_other_template(template_node) def _handle_document_template(self, template_node): """Handle document-wide templates by adding to root properties""" template_name = str(template_node.name).strip().lower() # Extract parameters params = {} for param in template_node.params: name = str(param.name).strip() value = str(param.value).strip() params[name] = value # Add to root properties if template_name not in self.root["properties"]: self.root["properties"][template_name] = {} self.root["properties"][template_name] = params def _handle_listing_template(self, template_node): """Handle listing templates (see, do, buy, eat, drink, sleep)""" template_name = str(template_node.name).strip().lower() # Extract parameters properties = {} for param in template_node.params: name = str(param.name).strip() value = str(param.value).strip() # Convert content to markdown if it's in the 'content' parameter if name == "content": value = self._convert_wikicode_to_markdown(param.value) properties[name] = value # Create listing node listing_node = { "type": template_name, "properties": properties, "children": [] } # Add to current section self.current_section["children"].append(listing_node) def _handle_other_template(self, template_node): """Handle other templates as general template nodes""" template_name = str(template_node.name).strip().lower() # Extract parameters properties = { "name": template_name, "params": {} } for param in template_node.params: name = str(param.name).strip() value = str(param.value).strip() properties["params"][name] = value # Create template node template_node = { "type": "template", "properties": properties, "children": [] } # Add to current section self.current_section["children"].append(template_node) def _convert_wikicode_to_markdown(self, wikicode) -> str: """Convert wikicode to markdown""" markdown = "" for node in wikicode.nodes: if isinstance(node, nodes.text.Text): markdown += str(node.value) elif isinstance(node, nodes.tag.Tag): markdown += self._convert_tag_to_markdown(node) elif isinstance(node, nodes.wikilink.Wikilink): markdown += self._convert_wikilink_to_markdown(node) elif isinstance(node, nodes.external_link.ExternalLink): markdown += self._convert_external_link_to_markdown(node) else: # For other nodes, just use their string representation markdown += str(node) return markdown.strip() def _convert_tag_to_markdown(self, tag_node) -> str: """Convert HTML tag to markdown""" tag = str(tag_node.tag).lower() content = str(tag_node.contents) # Convert the content recursively to handle nested tags if tag_node.contents: content = self._convert_wikicode_to_markdown(tag_node.contents) # Handle different tags if tag == 'b' or tag == 'strong': return f"**{content}**" elif tag == 'i' or tag == 'em': return f"*{content}*" elif tag == 'u': return f"_{content}_" elif tag == 'strike' or tag == 's' or tag == 'del': return f"~~{content}~~" elif tag == 'code': return f"`{content}`" elif tag == 'pre': return f"```\n{content}\n```" elif tag == 'br': return "\n" elif tag == 'hr': return "\n---\n" elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: level = int(tag[1]) return f"\n{'#' * level} {content}\n" elif tag == 'a': href = "" for attr in tag_node.attributes: if str(attr.name).lower() == 'href': href = str(attr.value) break return f"[{content}]({href})" elif tag == 'img': src = alt = "" for attr in tag_node.attributes: if str(attr.name).lower() == 'src': src = str(attr.value) elif str(attr.name).lower() == 'alt': alt = str(attr.value) return f"![{alt}]({src})" else: # For unknown tags, just return the content return content def _convert_wikilink_to_markdown(self, wikilink_node) -> str: """Convert wikilink to markdown""" title = str(wikilink_node.title) if wikilink_node.text: text = str(wikilink_node.text) return f"[{text}]({title})" else: return f"[{title}]({title})" def _convert_external_link_to_markdown(self, link_node) -> str: """Convert external link to markdown""" url = str(link_node.url) if link_node.title: title = str(link_node.title) return f"[{title}]({url})" else: return url def export_json(self, root=None, indent=2) -> str: """Export the tree as JSON string""" if root is None: root = self.root return json.dumps(root, indent=indent) def process_file(input_file: Path, parser: WikivoyageParser) -> None: """Process a single wiki file and save JSON output""" # Create output path with .json extension output_file = input_file.with_suffix('.json') # Ensure output directory exists output_file.parent.mkdir(parents=True, exist_ok=True) try: # Read and parse input file with open(input_file, 'r', encoding='utf-8') as f: wikitext = f.read() result = parser.parse(wikitext) # Write JSON output with open(output_file, 'w', encoding='utf-8') as f: f.write(parser.export_json()) except Exception as e: print(f"Error processing {input_file}: {e}") def main(): # Initialize parser once for reuse parser = WikivoyageParser() # Get input directory from command line or use current directory input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') # Process all .txt files recursively for txt_file in input_dir.rglob('*.txt'): print(f"Processing {txt_file}") process_file(txt_file, parser) print("Processing complete") if __name__ == "__main__": main()