Source code for ssmd.ssml_parser

"""SSML to SSMD converter - reverse conversion."""

import re
import xml.etree.ElementTree as ET
from typing import TYPE_CHECKING, Any

from ssmd.formatter import format_ssmd
from ssmd.parser import parse_sentences
from ssmd.ssml_conversions import SSML_BREAK_STRENGTH_MAP
from ssmd.utils import format_ssmd_attr

if TYPE_CHECKING:
    from ssmd.capabilities import TTSCapabilities


[docs] class SSMLParser: """Convert SSML to SSMD markdown format. This class provides the reverse conversion from SSML XML to the more human-readable SSMD markdown syntax. Example: >>> parser = SSMLParser() >>> ssml = '<speak><emphasis>Hello</emphasis> world</speak>' >>> ssmd = parser.to_ssmd(ssml) >>> print(ssmd) '*Hello* world' """ # Standard locales that can be simplified (locale -> language code) STANDARD_LOCALES = { "en-US": "en", "en-GB": "en-GB", # Keep non-US English locales "de-DE": "de", "fr-FR": "fr", "es-ES": "es", "it-IT": "it", "pt-PT": "pt", "ru-RU": "ru", "zh-CN": "zh", "ja-JP": "ja", "ko-KR": "ko", }
[docs] def __init__(self, config: dict[str, Any] | None = None): """Initialize SSML parser. Args: config: Optional configuration dictionary """ self.config = config or {}
def _format_attr(self, key: str, value: str) -> str: return format_ssmd_attr(key, value) def _format_attrs(self, pairs: list[tuple[str, str]]) -> str: return " ".join(self._format_attr(key, value) for key, value in pairs) def _wrap_directive(self, content: str, attrs: str) -> str: content = content.strip() return f"<div {attrs}>{{DIRECTIVE_NEWLINE}}{content}{{DIRECTIVE_NEWLINE}}</div>" def _element_namespace(self, element: ET.Element) -> str | None: if element.tag.startswith("{"): return element.tag.split("}")[0][1:] return None
[docs] def to_ssmd( self, ssml: str, *, capabilities: "TTSCapabilities | str | None" = None ) -> str: """Convert SSML to SSMD format. Args: ssml: SSML XML string capabilities: Optional TTS capabilities (preset name or object) Returns: SSMD markdown string with proper formatting (each sentence on new line) Example: >>> parser = SSMLParser() >>> parser.to_ssmd('<speak><emphasis>Hello</emphasis></speak>') '*Hello*' """ # Wrap in <speak> if not already wrapped if not ssml.strip().startswith("<speak"): ssml = f"<speak>{ssml}</speak>" # Register common SSML namespaces try: ET.register_namespace("amazon", "https://amazon.com/ssml") except Exception: pass # Namespace might already be registered try: root = ET.fromstring(ssml) except ET.ParseError as e: raise ValueError(f"Invalid SSML XML: {e}") from e # Process the root element result = self._process_element(root) # Clean up whitespace result = self._clean_whitespace(result) # Restore directive and sentence newlines (protected during whitespace cleaning) result = ( result.replace("{DIRECTIVE_NEWLINE}", "\n") .replace("{SENTENCE_NEWLINE}", "\n") .strip() ) # Parse into sentences and format with proper line breaks sentences = parse_sentences( result.strip(), capabilities=capabilities, strict_parse=capabilities is not None, ) return format_ssmd(sentences)
def _process_element(self, element: ET.Element) -> str: """Process an XML element and its children recursively. Args: element: XML element to process Returns: SSMD formatted string """ tag = element.tag.split("}")[-1] # Remove namespace if present namespace = self._element_namespace(element) # Handle different SSML tags if tag == "speak": return self._process_children(element) elif tag == "p": content = self._process_children(element) # Paragraphs are separated by double newlines return f"{content}\n\n" elif tag == "s": # Sentences - preserve explicit line breaks return f"{self._process_children(element)}{{SENTENCE_NEWLINE}}" elif tag == "emphasis": return self._process_emphasis(element) elif tag == "break": return self._process_break(element) elif tag == "prosody": return self._process_prosody(element) elif tag == "lang": return self._process_language(element) elif tag == "voice": return self._process_voice(element) elif tag == "phoneme": return self._process_phoneme(element) elif tag == "sub": return self._process_substitution(element) elif tag == "say-as": return self._process_say_as(element) elif tag == "audio": return self._process_audio(element) elif tag == "mark": return self._process_mark(element) elif tag == "effect" and namespace == "https://amazon.com/ssml": return self._process_amazon_effect(element) else: # Unknown tag - just process children return self._process_children(element) def _process_children(self, element: ET.Element) -> str: """Process all children of an element. Args: element: Parent element Returns: Combined SSMD string from all children """ result = [] # Add text before first child if element.text: result.append(element.text) # Process each child for child in element: result.append(self._process_element(child)) # Add text after child if child.tail: result.append(child.tail) result_text = "".join(result) return re.sub(r"\s+\n\n\s+", "\n\n", result_text) def _process_emphasis(self, element: ET.Element) -> str: """Convert <emphasis> to *text*, **text**, or _text_. Args: element: emphasis element Returns: SSMD emphasis syntax """ content = self._process_children(element) level = element.get("level", "moderate") if level in ("strong", "x-strong"): return f"**{content}**" elif level == "reduced": return f"_{content}_" elif level == "none": # Level "none" is rare - use explicit annotation return f"[{content}]{{{self._format_attr('emphasis', 'none')}}}" else: # moderate or default return f"*{content}*" def _process_break(self, element: ET.Element) -> str: """Convert <break> to ... notation. Args: element: break element Returns: SSMD break syntax with spaces """ time = element.get("time") strength = element.get("strength") if time: # Parse time value (e.g., "500ms", "2s") match = re.match(r"(\d+)(ms|s)", time) if match: # Breaks have spaces before and after per SSMD spec return f" ...{time} " # Fallback to 1s if time format is invalid return " ...1s " elif strength: marker = SSML_BREAK_STRENGTH_MAP.get(strength, "...s") return f" {marker} " # Default to sentence break return " ...s " def _process_prosody(self, element: ET.Element) -> str: """Convert <prosody> to directive or inline annotation. Args: element: prosody element Returns: SSMD prosody syntax """ content = self._process_children(element) volume = element.get("volume") rate = element.get("rate") pitch = element.get("pitch") # Filter out "medium" default values (ssml-maker adds these) if volume == "medium": volume = None if rate == "medium": rate = None if pitch == "medium": pitch = None if not any([volume, rate, pitch]): return content pairs: list[tuple[str, str]] = [] if volume: pairs.append(("volume", volume)) if rate: pairs.append(("rate", rate)) if pitch: pairs.append(("pitch", pitch)) if not pairs: return content attrs = self._format_attrs(pairs) is_multiline = "\n" in content.strip() or len(content.strip()) > 80 if is_multiline: return self._wrap_directive(content, attrs) return f"[{content}]{{{attrs}}}" def _process_language(self, element: ET.Element) -> str: """Convert <lang> to directive or inline annotation. Args: element: lang element Returns: SSMD language syntax """ content = self._process_children(element) lang = element.get("{http://www.w3.org/XML/1998/namespace}lang") or element.get( "lang" ) if not lang: return content simplified = self.STANDARD_LOCALES.get(lang, lang) is_multiline = "\n" in content.strip() or len(content.strip()) > 80 if element.findall("p"): is_multiline = True lang_attr = self._format_attr("lang", simplified) if is_multiline: return self._wrap_directive(content, lang_attr) return f"[{content}]{{{lang_attr}}}" def _process_voice(self, element: ET.Element) -> str: """Convert <voice> to directive or annotation syntax. Uses directive syntax (<div ...>) for multi-line content, and annotation syntax ([text]{voice="name"}) for single-line content. Args: element: voice element Returns: SSMD voice syntax """ content = self._process_children(element) # Get voice attributes name = element.get("name") language = element.get("language") gender = element.get("gender") variant = element.get("variant") # Check if content is multi-line (use directive syntax) # or single-line (use annotation) is_multiline = "\n" in content.strip() or len(content.strip()) > 80 if element.findall("p"): is_multiline = True # Directive syntax can be used for both simple names and complex attrs use_directive = is_multiline if use_directive: # Use block directive syntax for multi-line voice blocks parts = [] if name: parts.append(self._format_attr("voice", name)) if language: parts.append(self._format_attr("voice-lang", language)) if gender: parts.append(self._format_attr("gender", gender)) if variant: parts.append(self._format_attr("variant", variant)) if parts: attrs = " ".join(parts) return self._wrap_directive(content, attrs) # Use inline annotation syntax if name: # Simple name-only format return f"[{content}]{{{self._format_attr('voice', name)}}}" else: # Complex format with language/gender/variant parts = [] if language: parts.append(self._format_attr("voice-lang", language)) if gender: parts.append(self._format_attr("gender", gender)) if variant: parts.append(self._format_attr("variant", variant)) if parts: annotation = " ".join(parts) return f"[{content}]{{{annotation}}}" return content def _process_phoneme(self, element: ET.Element) -> str: """Convert <phoneme> to [text]{ph="..." alphabet="..."}. Args: element: phoneme element Returns: SSMD phoneme syntax """ content = self._process_children(element) alphabet = element.get("alphabet", "ipa") ph = element.get("ph", "") # Use explicit format: [text]{ph="value" alphabet="type"} attrs = self._format_attrs([("ph", ph), ("alphabet", alphabet)]) return f"[{content}]{{{attrs}}}" def _process_substitution(self, element: ET.Element) -> str: """Convert <sub> to [text]{sub="alias"}. Args: element: sub element Returns: SSMD substitution syntax """ content = self._process_children(element) alias = element.get("alias", "") if alias: return f"[{content}]{{{self._format_attr('sub', alias)}}}" return content def _process_say_as(self, element: ET.Element) -> str: """Convert <say-as> to [text]{as="type"}. Args: element: say-as element Returns: SSMD say-as syntax """ content = self._process_children(element) interpret_as = element.get("interpret-as", "") format_attr = element.get("format") detail_attr = element.get("detail") # Build annotation string parts = [self._format_attr("as", interpret_as)] if format_attr: parts.append(self._format_attr("format", format_attr)) if detail_attr: parts.append(self._format_attr("detail", detail_attr)) annotation = " ".join(parts) if interpret_as: return f"[{content}]{{{annotation}}}" return content def _process_audio(self, element: ET.Element) -> str: """Convert <audio> to [desc]{src="url" ...}. Args: element: audio element Returns: SSMD audio syntax with attributes """ src = element.get("src", "") # Get advanced attributes clip_begin = element.get("clipBegin") clip_end = element.get("clipEnd") speed = element.get("speed") repeat_count = element.get("repeatCount") repeat_dur = element.get("repeatDur") sound_level = element.get("soundLevel") # Extract description and alt text description = "" has_desc_tag = False # Look for <desc> child element desc_elem = element.find("desc") if desc_elem is not None and desc_elem.text: description = desc_elem.text has_desc_tag = True # Get all text content (including text and tail from children) content_text = "" if element.text: content_text = element.text # Get tail text from children (after desc) for child in element: if child.tail: content_text += child.tail content_text = content_text.strip() # If there's no <desc> tag but there is text content, # treat the text as description if not has_desc_tag and content_text: description = content_text if not src: return description if description else content_text pairs = [("src", src)] if clip_begin and clip_end: pairs.append(("clip", f"{clip_begin}-{clip_end}")) if speed: pairs.append(("speed", speed)) if repeat_count: pairs.append(("repeat", repeat_count)) if repeat_dur: pairs.append(("repeatDur", repeat_dur)) if sound_level: pairs.append(("level", sound_level)) if has_desc_tag and content_text: pairs.append(("alt", content_text)) annotation = self._format_attrs([(key, str(value)) for key, value in pairs]) if description: return f"[{description}]{{{annotation}}}" return f"[]{{{annotation}}}" def _process_mark(self, element: ET.Element) -> str: """Convert <mark> to @name. Args: element: mark element Returns: SSMD mark syntax with spaces """ name = element.get("name", "") if name: # Marks have space before and after return f" @{name} " return "" def _process_amazon_effect(self, element: ET.Element) -> str: """Convert Amazon effects to [text]{ext="name"}. Args: element: amazon:effect element Returns: SSMD extension syntax """ content = self._process_children(element) name = element.get("name", "") # Map Amazon effect names to SSMD extensions effect_map = { "whispered": "whisper", "drc": "drc", } ext_name = effect_map.get(name, name) if ext_name: return f"[{content}]{{{self._format_attr('ext', ext_name)}}}" return content def _clean_whitespace(self, text: str) -> str: """Clean up excessive whitespace while preserving paragraph breaks. Args: text: Text to clean Returns: Cleaned text """ # Preserve paragraph breaks (double newlines) text = text.strip("\n") parts = re.split(r"\n\n+", text) cleaned_parts = [] for part in parts: # Collapse multiple spaces, tabs, and single newlines cleaned = re.sub(r"[ \t\n]+", " ", part) cleaned = cleaned.strip() if cleaned: cleaned_parts.append(cleaned) # Join with double newlines for paragraphs return "\n\n".join(cleaned_parts)