"""SSML to SSMD converter - reverse conversion."""
import re
import xml.etree.ElementTree as ET
from typing import TYPE_CHECKING, Any
from ssmd.formatter import format_ssmd
from ssmd.parser import parse_sentences
from ssmd.ssml_conversions import SSML_BREAK_STRENGTH_MAP
from ssmd.utils import format_ssmd_attr
if TYPE_CHECKING:
from ssmd.capabilities import TTSCapabilities
[docs]
class SSMLParser:
"""Convert SSML to SSMD markdown format.
This class provides the reverse conversion from SSML XML to the more
human-readable SSMD markdown syntax.
Example:
>>> parser = SSMLParser()
>>> ssml = '<speak><emphasis>Hello</emphasis> world</speak>'
>>> ssmd = parser.to_ssmd(ssml)
>>> print(ssmd)
*Hello* world
"""
# Locale tag -> shorter code used when emitting an SSMD ``lang`` attribute.
# Lookups fall back to the original tag, so locales that are not listed
# here are emitted unchanged.
STANDARD_LOCALES = {
"en-US": "en",
"en-GB": "en-GB", # Maps to itself: non-US English keeps its region suffix
"de-DE": "de",
"fr-FR": "fr",
"es-ES": "es",
"it-IT": "it",
"pt-PT": "pt",
"ru-RU": "ru",
"zh-CN": "zh",
"ja-JP": "ja",
"ko-KR": "ko",
}
[docs]
def __init__(self, config: dict[str, Any] | None = None):
"""Initialize SSML parser.
Args:
config: Optional configuration dictionary
"""
self.config = config or {}
def _format_attr(self, key: str, value: str) -> str:
    """Render a single attribute by delegating to ``format_ssmd_attr``.

    Args:
        key: Attribute name.
        value: Attribute value.

    Returns:
        Whatever ``format_ssmd_attr(key, value)`` produces.
    """
    rendered = format_ssmd_attr(key, value)
    return rendered
def _format_attrs(self, pairs: list[tuple[str, str]]) -> str:
return " ".join(self._format_attr(key, value) for key, value in pairs)
def _wrap_directive(self, content: str, attrs: str) -> str:
content = content.strip()
return f"<div {attrs}>{{DIRECTIVE_NEWLINE}}{content}{{DIRECTIVE_NEWLINE}}</div>"
def _element_namespace(self, element: ET.Element) -> str | None:
if element.tag.startswith("{"):
return element.tag.split("}")[0][1:]
return None
[docs]
def to_ssmd(
    self, ssml: str, *, capabilities: "TTSCapabilities | str | None" = None
) -> str:
    """Convert SSML to SSMD format.

    Input that is not already a ``<speak>`` document is wrapped in
    ``<speak>...</speak>`` before parsing. Input that begins with an XML
    declaration or a DOCTYPE is treated as a complete document and parsed
    as-is.

    Args:
        ssml: SSML XML string (a full document or a fragment)
        capabilities: Optional TTS capabilities (preset name or object).
            When given, the intermediate SSMD is parsed in strict mode.

    Returns:
        SSMD markdown string with proper formatting (each sentence on new line)

    Raises:
        ValueError: If the (possibly wrapped) input is not well-formed XML.

    Example:
        >>> parser = SSMLParser()
        >>> parser.to_ssmd('<speak><emphasis>Hello</emphasis></speak>')
        '*Hello*'
    """
    document = ssml.strip()
    # A prolog (XML declaration / DOCTYPE) must stay at the very start of the
    # document; wrapping it in <speak> would make the XML ill-formed.
    if not document.startswith(("<speak", "<?xml", "<!DOCTYPE")):
        document = f"<speak>{ssml}</speak>"

    # Register common SSML namespaces. This is best-effort: a rejected
    # prefix (ValueError) must not prevent the conversion itself.
    try:
        ET.register_namespace("amazon", "https://amazon.com/ssml")
    except ValueError:
        pass

    try:
        root = ET.fromstring(document)
    except ET.ParseError as e:
        raise ValueError(f"Invalid SSML XML: {e}") from e

    # Convert the element tree, then normalise whitespace while the
    # directive/sentence line breaks are still protected as placeholders.
    result = self._clean_whitespace(self._process_element(root))

    # Restore directive and sentence newlines (protected during whitespace cleaning)
    result = (
        result.replace("{DIRECTIVE_NEWLINE}", "\n")
        .replace("{SENTENCE_NEWLINE}", "\n")
        .strip()
    )

    # Parse into sentences and format with proper line breaks
    sentences = parse_sentences(
        result,
        capabilities=capabilities,
        strict_parse=capabilities is not None,
    )
    return format_ssmd(sentences)
def _process_element(self, element: ET.Element) -> str:
"""Process an XML element and its children recursively.
Args:
element: XML element to process
Returns:
SSMD formatted string
"""
tag = element.tag.split("}")[-1] # Remove namespace if present
namespace = self._element_namespace(element)
# Handle different SSML tags
if tag == "speak":
return self._process_children(element)
elif tag == "p":
content = self._process_children(element)
# Paragraphs are separated by double newlines
return f"{content}\n\n"
elif tag == "s":
# Sentences - preserve explicit line breaks
return f"{self._process_children(element)}{{SENTENCE_NEWLINE}}"
elif tag == "emphasis":
return self._process_emphasis(element)
elif tag == "break":
return self._process_break(element)
elif tag == "prosody":
return self._process_prosody(element)
elif tag == "lang":
return self._process_language(element)
elif tag == "voice":
return self._process_voice(element)
elif tag == "phoneme":
return self._process_phoneme(element)
elif tag == "sub":
return self._process_substitution(element)
elif tag == "say-as":
return self._process_say_as(element)
elif tag == "audio":
return self._process_audio(element)
elif tag == "mark":
return self._process_mark(element)
elif tag == "effect" and namespace == "https://amazon.com/ssml":
return self._process_amazon_effect(element)
else:
# Unknown tag - just process children
return self._process_children(element)
def _process_children(self, element: ET.Element) -> str:
"""Process all children of an element.
Args:
element: Parent element
Returns:
Combined SSMD string from all children
"""
result = []
# Add text before first child
if element.text:
result.append(element.text)
# Process each child
for child in element:
result.append(self._process_element(child))
# Add text after child
if child.tail:
result.append(child.tail)
result_text = "".join(result)
return re.sub(r"\s+\n\n\s+", "\n\n", result_text)
def _process_emphasis(self, element: ET.Element) -> str:
"""Convert <emphasis> to *text*, **text**, or _text_.
Args:
element: emphasis element
Returns:
SSMD emphasis syntax
"""
content = self._process_children(element)
level = element.get("level", "moderate")
if level in ("strong", "x-strong"):
return f"**{content}**"
elif level == "reduced":
return f"_{content}_"
elif level == "none":
# Level "none" is rare - use explicit annotation
return f"[{content}]{{{self._format_attr('emphasis', 'none')}}}"
else: # moderate or default
return f"*{content}*"
def _process_break(self, element: ET.Element) -> str:
"""Convert <break> to ... notation.
Args:
element: break element
Returns:
SSMD break syntax with spaces
"""
time = element.get("time")
strength = element.get("strength")
if time:
# Parse time value (e.g., "500ms", "2s")
match = re.match(r"(\d+)(ms|s)", time)
if match:
# Breaks have spaces before and after per SSMD spec
return f" ...{time} "
# Fallback to 1s if time format is invalid
return " ...1s "
elif strength:
marker = SSML_BREAK_STRENGTH_MAP.get(strength, "...s")
return f" {marker} "
# Default to sentence break
return " ...s "
def _process_prosody(self, element: ET.Element) -> str:
"""Convert <prosody> to directive or inline annotation.
Args:
element: prosody element
Returns:
SSMD prosody syntax
"""
content = self._process_children(element)
volume = element.get("volume")
rate = element.get("rate")
pitch = element.get("pitch")
# Filter out "medium" default values (ssml-maker adds these)
if volume == "medium":
volume = None
if rate == "medium":
rate = None
if pitch == "medium":
pitch = None
if not any([volume, rate, pitch]):
return content
pairs: list[tuple[str, str]] = []
if volume:
pairs.append(("volume", volume))
if rate:
pairs.append(("rate", rate))
if pitch:
pairs.append(("pitch", pitch))
if not pairs:
return content
attrs = self._format_attrs(pairs)
is_multiline = "\n" in content.strip() or len(content.strip()) > 80
if is_multiline:
return self._wrap_directive(content, attrs)
return f"[{content}]{{{attrs}}}"
def _process_language(self, element: ET.Element) -> str:
"""Convert <lang> to directive or inline annotation.
Args:
element: lang element
Returns:
SSMD language syntax
"""
content = self._process_children(element)
lang = element.get("{http://www.w3.org/XML/1998/namespace}lang") or element.get(
"lang"
)
if not lang:
return content
simplified = self.STANDARD_LOCALES.get(lang, lang)
is_multiline = "\n" in content.strip() or len(content.strip()) > 80
if element.findall("p"):
is_multiline = True
lang_attr = self._format_attr("lang", simplified)
if is_multiline:
return self._wrap_directive(content, lang_attr)
return f"[{content}]{{{lang_attr}}}"
def _process_voice(self, element: ET.Element) -> str:
"""Convert <voice> to directive or annotation syntax.
Uses directive syntax (<div ...>) for multi-line content,
and annotation syntax ([text]{voice="name"}) for single-line content.
Args:
element: voice element
Returns:
SSMD voice syntax
"""
content = self._process_children(element)
# Get voice attributes
name = element.get("name")
language = element.get("language")
gender = element.get("gender")
variant = element.get("variant")
# Check if content is multi-line (use directive syntax)
# or single-line (use annotation)
is_multiline = "\n" in content.strip() or len(content.strip()) > 80
if element.findall("p"):
is_multiline = True
# Directive syntax can be used for both simple names and complex attrs
use_directive = is_multiline
if use_directive:
# Use block directive syntax for multi-line voice blocks
parts = []
if name:
parts.append(self._format_attr("voice", name))
if language:
parts.append(self._format_attr("voice-lang", language))
if gender:
parts.append(self._format_attr("gender", gender))
if variant:
parts.append(self._format_attr("variant", variant))
if parts:
attrs = " ".join(parts)
return self._wrap_directive(content, attrs)
# Use inline annotation syntax
if name:
# Simple name-only format
return f"[{content}]{{{self._format_attr('voice', name)}}}"
else:
# Complex format with language/gender/variant
parts = []
if language:
parts.append(self._format_attr("voice-lang", language))
if gender:
parts.append(self._format_attr("gender", gender))
if variant:
parts.append(self._format_attr("variant", variant))
if parts:
annotation = " ".join(parts)
return f"[{content}]{{{annotation}}}"
return content
def _process_phoneme(self, element: ET.Element) -> str:
"""Convert <phoneme> to [text]{ph="..." alphabet="..."}.
Args:
element: phoneme element
Returns:
SSMD phoneme syntax
"""
content = self._process_children(element)
alphabet = element.get("alphabet", "ipa")
ph = element.get("ph", "")
# Use explicit format: [text]{ph="value" alphabet="type"}
attrs = self._format_attrs([("ph", ph), ("alphabet", alphabet)])
return f"[{content}]{{{attrs}}}"
def _process_substitution(self, element: ET.Element) -> str:
"""Convert <sub> to [text]{sub="alias"}.
Args:
element: sub element
Returns:
SSMD substitution syntax
"""
content = self._process_children(element)
alias = element.get("alias", "")
if alias:
return f"[{content}]{{{self._format_attr('sub', alias)}}}"
return content
def _process_say_as(self, element: ET.Element) -> str:
"""Convert <say-as> to [text]{as="type"}.
Args:
element: say-as element
Returns:
SSMD say-as syntax
"""
content = self._process_children(element)
interpret_as = element.get("interpret-as", "")
format_attr = element.get("format")
detail_attr = element.get("detail")
# Build annotation string
parts = [self._format_attr("as", interpret_as)]
if format_attr:
parts.append(self._format_attr("format", format_attr))
if detail_attr:
parts.append(self._format_attr("detail", detail_attr))
annotation = " ".join(parts)
if interpret_as:
return f"[{content}]{{{annotation}}}"
return content
def _process_audio(self, element: ET.Element) -> str:
"""Convert <audio> to [desc]{src="url" ...}.
Args:
element: audio element
Returns:
SSMD audio syntax with attributes
"""
src = element.get("src", "")
# Get advanced attributes
clip_begin = element.get("clipBegin")
clip_end = element.get("clipEnd")
speed = element.get("speed")
repeat_count = element.get("repeatCount")
repeat_dur = element.get("repeatDur")
sound_level = element.get("soundLevel")
# Extract description and alt text
description = ""
has_desc_tag = False
# Look for <desc> child element
desc_elem = element.find("desc")
if desc_elem is not None and desc_elem.text:
description = desc_elem.text
has_desc_tag = True
# Get all text content (including text and tail from children)
content_text = ""
if element.text:
content_text = element.text
# Get tail text from children (after desc)
for child in element:
if child.tail:
content_text += child.tail
content_text = content_text.strip()
# If there's no <desc> tag but there is text content,
# treat the text as description
if not has_desc_tag and content_text:
description = content_text
if not src:
return description if description else content_text
pairs = [("src", src)]
if clip_begin and clip_end:
pairs.append(("clip", f"{clip_begin}-{clip_end}"))
if speed:
pairs.append(("speed", speed))
if repeat_count:
pairs.append(("repeat", repeat_count))
if repeat_dur:
pairs.append(("repeatDur", repeat_dur))
if sound_level:
pairs.append(("level", sound_level))
if has_desc_tag and content_text:
pairs.append(("alt", content_text))
annotation = self._format_attrs([(key, str(value)) for key, value in pairs])
if description:
return f"[{description}]{{{annotation}}}"
return f"[]{{{annotation}}}"
def _process_mark(self, element: ET.Element) -> str:
"""Convert <mark> to @name.
Args:
element: mark element
Returns:
SSMD mark syntax with spaces
"""
name = element.get("name", "")
if name:
# Marks have space before and after
return f" @{name} "
return ""
def _process_amazon_effect(self, element: ET.Element) -> str:
"""Convert Amazon effects to [text]{ext="name"}.
Args:
element: amazon:effect element
Returns:
SSMD extension syntax
"""
content = self._process_children(element)
name = element.get("name", "")
# Map Amazon effect names to SSMD extensions
effect_map = {
"whispered": "whisper",
"drc": "drc",
}
ext_name = effect_map.get(name, name)
if ext_name:
return f"[{content}]{{{self._format_attr('ext', ext_name)}}}"
return content
def _clean_whitespace(self, text: str) -> str:
"""Clean up excessive whitespace while preserving paragraph breaks.
Args:
text: Text to clean
Returns:
Cleaned text
"""
# Preserve paragraph breaks (double newlines)
text = text.strip("\n")
parts = re.split(r"\n\n+", text)
cleaned_parts = []
for part in parts:
# Collapse multiple spaces, tabs, and single newlines
cleaned = re.sub(r"[ \t\n]+", " ", part)
cleaned = cleaned.strip()
if cleaned:
cleaned_parts.append(cleaned)
# Join with double newlines for paragraphs
return "\n\n".join(cleaned_parts)