# Source code for ssmd.capabilities

"""TTS capability definitions and presets.

This module defines which SSML features are supported by various TTS engines
and provides capability-based filtering for SSMD processing.
"""

import importlib.resources as resources
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any


@dataclass
class CapabilityProfile:
    """Named collection of tag and attribute names forming a capability profile.

    Only ``name`` is required; each collection field defaults to a fresh,
    empty container for every instance.
    """

    # Identifier of the profile (for example "ssmd-core").
    name: str
    # Names of the inline tags included in the profile.
    inline_tags: set[str] = field(default_factory=set)
    # Names of the block-level tags included in the profile.
    block_tags: set[str] = field(default_factory=set)
    # Attribute names, keyed by the tag name they belong to.
    attributes: dict[str, set[str]] = field(default_factory=dict)
    # NOTE(review): presumably the allowed values per attribute or tag; none of
    # the profiles visible alongside this class populate it — confirm the schema.
    values: dict[str, set[str]] = field(default_factory=dict)


[docs] class TTSCapabilities: """Define TTS engine capabilities. This class allows you to specify which SSML features your TTS engine supports. Unsupported features will be automatically stripped to plain text. Example: >>> # Basic TTS with minimal support >>> caps = TTSCapabilities( ... emphasis=False, ... break_tags=True, ... prosody=False ... ) >>> >>> parser = SSMD(capabilities=caps) >>> ssml = parser.to_ssml("Hello *world*!") >>> # Output: <speak><p>Hello world!</p></speak> >>> # (emphasis stripped because not supported) """
[docs] def __init__( self, # Core features emphasis: bool = True, break_tags: bool = True, paragraph: bool = True, # Language & pronunciation language: bool = True, phoneme: bool = True, substitution: bool = True, # Prosody (volume, rate, pitch) prosody: bool = True, volume: bool = True, rate: bool = True, pitch: bool = True, # Advanced features say_as: bool = True, audio: bool = True, mark: bool = True, # Extensions (platform-specific) extensions: dict[str, bool] | None = None, # Sentence and heading support sentence_tags: bool = True, heading_emphasis: bool = True, # ssml-green raw capabilities ssml_green: dict[str, bool] | None = None, language_scopes: dict[str, bool] | None = None, ): """Initialize TTS capabilities. Args: emphasis: Support for <emphasis> tags break_tags: Support for <break> tags paragraph: Support for <p> tags language: Support for <lang> tags phoneme: Support for <phoneme> tags substitution: Support for <sub> tags prosody: Support for <prosody> tags (general) volume: Support for volume attribute rate: Support for rate attribute pitch: Support for pitch attribute say_as: Support for <say-as> tags audio: Support for <audio> tags mark: Support for <mark> tags extensions: Dict of extension names and their support sentence_tags: Support for <s> tags heading_emphasis: Support for heading emphasis ssml_green: Raw ssml-green capabilities map (flattened) language_scopes: Optional language scope support map """ self.emphasis = emphasis self.break_tags = break_tags self.paragraph = paragraph self.language = language self.phoneme = phoneme self.substitution = substitution self.prosody = prosody self.volume = volume and prosody self.rate = rate and prosody self.pitch = pitch and prosody self.say_as = say_as self.audio = audio self.mark = mark self.extensions = extensions or {} self.sentence_tags = sentence_tags self.heading_emphasis = heading_emphasis self.ssml_green = ssml_green or {} self.language_scopes = language_scopes or {}
[docs] def to_config(self) -> dict[str, Any]: """Convert capabilities to SSMD config. Returns: Configuration dict for SSMD converter """ config: dict[str, Any] = { "skip": [], "capabilities": self, } # Skip processors for unsupported features if not self.emphasis: config["skip"].append("emphasis") if not self.break_tags: config["skip"].append("break") if not self.paragraph: config["skip"].append("paragraph") if not self.mark: config["skip"].append("mark") # Prosody is handled specially (selective attributes) if not self.prosody: config["skip"].append("prosody") # Headings handled by modifying heading_levels if not self.heading_emphasis: config["heading_levels"] = {} # No heading processing return config
[docs] def supports_extension(self, extension_name: str) -> bool: """Check if an extension is supported. Args: extension_name: Name of the extension Returns: True if supported """ return self.extensions.get(extension_name, False)
[docs] def supports_key(self, key: str, default: bool = True) -> bool: """Check raw ssml-green capability key. Args: key: ssml-green key to check default: Default if key is missing Returns: True if supported """ return self.ssml_green.get(key, default)
# Preset capability definitions for common TTS engines
ESPEAK_CAPABILITIES = TTSCapabilities(
    emphasis=False,  # eSpeak doesn't support emphasis
    break_tags=True,
    paragraph=False,  # eSpeak treats paragraphs as plain text
    language=True,
    phoneme=True,  # eSpeak has good phoneme support
    substitution=False,
    prosody=True,
    volume=True,
    rate=True,
    pitch=True,
    say_as=True,
    audio=True,
    mark=True,
    sentence_tags=True,
    heading_emphasis=True,
)

PYTTSX3_CAPABILITIES = TTSCapabilities(
    emphasis=False,  # pyttsx3 has minimal SSML support
    break_tags=False,
    paragraph=False,
    language=False,  # Voice selection, not SSML
    phoneme=False,
    substitution=False,
    prosody=True,  # Via properties, not SSML
    volume=True,
    rate=True,
    pitch=False,
    say_as=False,
    audio=False,
    mark=False,
    sentence_tags=False,
    heading_emphasis=False,
)

GOOGLE_TTS_CAPABILITIES = TTSCapabilities(
    emphasis=True,
    break_tags=True,
    paragraph=True,
    language=True,
    phoneme=True,
    substitution=True,
    prosody=True,
    volume=True,
    rate=True,
    pitch=True,
    say_as=True,
    audio=True,
    mark=True,
    sentence_tags=True,
    heading_emphasis=True,
)

AMAZON_POLLY_CAPABILITIES = TTSCapabilities(
    emphasis=True,
    break_tags=True,
    paragraph=True,
    language=True,
    phoneme=True,
    substitution=True,
    prosody=True,
    volume=True,
    rate=True,
    pitch=True,
    say_as=True,
    audio=False,  # Limited audio support
    mark=True,
    extensions={"whisper": True, "drc": True},  # Amazon-specific
    sentence_tags=True,
    heading_emphasis=True,
)

AZURE_TTS_CAPABILITIES = TTSCapabilities(
    emphasis=True,
    break_tags=True,
    paragraph=True,
    language=True,
    phoneme=True,
    substitution=True,
    prosody=True,
    volume=True,
    rate=True,
    pitch=True,
    say_as=True,
    audio=True,
    mark=True,
    sentence_tags=True,
    heading_emphasis=True,
)

# Minimal fallback (plain text only)
MINIMAL_CAPABILITIES = TTSCapabilities(
    emphasis=False,
    break_tags=False,
    paragraph=False,
    language=False,
    phoneme=False,
    substitution=False,
    prosody=False,
    say_as=False,
    audio=False,
    mark=False,
    sentence_tags=False,
    heading_emphasis=False,
)

# Full SSML support (reference)
FULL_CAPABILITIES = TTSCapabilities()

SSMD_CORE_PROFILE = CapabilityProfile(
    name="ssmd-core",
    inline_tags={
        "emphasis",
        "break",
        "lang",
        "voice",
        "mark",
        "phoneme",
        "prosody",
        "say-as",
        "sub",
        "audio",
        "extension",
    },
    block_tags={
        "div",
        "heading",
        "paragraph",
    },
    attributes={
        "audio": {
            "src",
            "clip",
            "speed",
            "repeat",
            "repeatDur",
            "level",
            "alt",
        },
        "emphasis": {"level"},
        "lang": {"lang"},
        "phoneme": {"ph", "ipa", "sampa", "alphabet"},
        "prosody": {"volume", "rate", "pitch", "v", "r", "p"},
        "say-as": {"as", "format", "detail"},
        "sub": {"sub"},
        "voice": {"voice", "voice-lang", "gender", "variant"},
        "div": {
            "lang",
            "voice",
            "voice-lang",
            "gender",
            "variant",
            "volume",
            "rate",
            "pitch",
        },
        "heading": {"level"},
        "break": {"time", "strength"},
        "mark": {"name"},
        "extension": {"ext"},
    },
)

# Same as the core profile minus the "extension" tag and its attributes.
KOKORO_PROFILE = CapabilityProfile(
    name="kokoro",
    inline_tags=SSMD_CORE_PROFILE.inline_tags - {"extension"},
    block_tags=set(SSMD_CORE_PROFILE.block_tags),
    attributes={
        tag: set(attrs)
        for tag, attrs in SSMD_CORE_PROFILE.attributes.items()
        if tag != "extension"
    },
)

# Independent copy of the core profile under its own name.
GOOGLE_SSML_PROFILE = CapabilityProfile(
    name="google-ssml",
    inline_tags=set(SSMD_CORE_PROFILE.inline_tags),
    block_tags=set(SSMD_CORE_PROFILE.block_tags),
    attributes={
        tag: set(attrs) for tag, attrs in SSMD_CORE_PROFILE.attributes.items()
    },
)

PROFILES: dict[str, CapabilityProfile] = {
    "ssmd-core": SSMD_CORE_PROFILE,
    "kokoro": KOKORO_PROFILE,
    "google-ssml": GOOGLE_SSML_PROFILE,
}


def get_profile(name: str) -> CapabilityProfile:
    """Return the registered capability profile called ``name``.

    Raises:
        ValueError: If no profile with that name is registered.
    """
    if name in PROFILES:
        return PROFILES[name]
    known = ", ".join(sorted(PROFILES))
    raise ValueError(f"Unknown profile '{name}'. Available: {known}")


def list_profiles() -> list[str]:
    """Return the registered profile names in sorted order."""
    return sorted(PROFILES)


def _flatten_ssml_green(data: dict[str, Any]) -> dict[str, bool]:
    """Merge the boolean entries of every dict-valued section into one map.

    Non-dict sections and non-boolean values are ignored; when two sections
    share a key, the later section wins.
    """
    return {
        key: value
        for section in data.values()
        if isinstance(section, dict)
        for key, value in section.items()
        if isinstance(value, bool)
    }


def _load_ssml_green_data(data: str) -> TTSCapabilities:
    """Build capabilities from the JSON text of an ssml-green platform file.

    Any key missing from the data is treated as supported.
    """
    flat = _flatten_ssml_green(json.loads(data))

    def supported(key: str) -> bool:
        # Absent keys count as supported.
        return flat.get(key, True)

    # NOTE(review): the lookup keys below contain empty "››" segments and a
    # "~~" marker; it looks like element names were lost when this source was
    # extracted — confirm against the ssml-green data files.
    emphasis = supported("elements››level (optional)")
    level_keys = (
        'attribute values››level="strong"',
        'attribute values››level="moderate" (default)',
        'attribute values››level="none"',
        'attribute values››level="reduced"',
    )
    reported_levels = [flat[key] for key in level_keys if key in flat]
    # Emphasis is dropped when level values are reported but none is supported.
    if emphasis and reported_levels and not any(reported_levels):
        emphasis = False

    break_tags = supported("elements››strength (optional)") or supported(
        "elements››time (optional)"
    )
    phoneme = supported("elements››ph (required)")
    substitution = supported("elements››alias (required)")

    rate = supported("elements››rate (optional)")
    pitch = supported("elements››pitch (optional)")
    volume = supported("elements››volume (optional)")
    prosody = rate or pitch or volume

    language_root = supported("elements››xml:lang (required)")
    language_sentence = supported("elements›~~(sentence)›xml:lang (optional)")
    language_paragraph = supported("elements› (paragraph)›xml:lang (optional)")
    language = language_root or language_sentence or language_paragraph

    say_as = supported("elements››interpret-as (required)")

    return TTSCapabilities(
        emphasis=emphasis,
        break_tags=break_tags,
        paragraph=True,
        language=language,
        phoneme=phoneme,
        substitution=substitution,
        prosody=prosody,
        volume=volume,
        rate=rate,
        pitch=pitch,
        say_as=say_as,
        ssml_green=flat,
        language_scopes={
            "root": language_root,
            "sentence": language_sentence,
            "paragraph": language_paragraph,
        },
    )


def load_ssml_green_platform(path: str | Path) -> TTSCapabilities:
    """Read an ssml-green platform JSON file (UTF-8) and build capabilities."""
    text = Path(path).read_text(encoding="utf-8")
    return _load_ssml_green_data(text)


# Preset lookup
PRESETS: dict[str, TTSCapabilities] = {
    "espeak": ESPEAK_CAPABILITIES,
    "pyttsx3": PYTTSX3_CAPABILITIES,
    "google": GOOGLE_TTS_CAPABILITIES,
    "polly": AMAZON_POLLY_CAPABILITIES,
    "amazon": AMAZON_POLLY_CAPABILITIES,
    "azure": AZURE_TTS_CAPABILITIES,
    "microsoft": AZURE_TTS_CAPABILITIES,
    "minimal": MINIMAL_CAPABILITIES,
    "full": FULL_CAPABILITIES,
}

SSML_GREEN_FILES = {
    "alexa": "amazon-alexa.json",
    "amazon": "amazon-polly.json",
    "polly": "amazon-polly.json",
    "google": "google-home.json",
    "ibm": "ibm-watson.json",
    "watson": "ibm-watson.json",
    "azure": "microsoft-azure.json",
    "microsoft": "microsoft-azure.json",
    "cortana": "microsoft-cortana.json",
}


def _load_ssml_green_preset(name: str) -> TTSCapabilities | None:
    """Load the ssml-green data file mapped to ``name``, if one can be found.

    The packaged ``ssmd`` data resource is tried first, then a ``data``
    directory next to this module. Returns None when the name has no mapped
    file or the file is not found in either place.
    """
    file_name = SSML_GREEN_FILES.get(name)
    if not file_name:
        return None

    try:
        packaged = resources.files("ssmd").joinpath("data").joinpath(file_name)
        if packaged.is_file():
            return _load_ssml_green_data(packaged.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best effort: any failure on the packaged resource falls
        # through to the on-disk lookup below.
        pass

    on_disk = Path(__file__).parent / "data" / file_name
    if not on_disk.exists():
        return None
    return load_ssml_green_platform(on_disk)
def get_preset(name: str) -> TTSCapabilities:
    """Look up a preset capability configuration by name.

    The name is matched case-insensitively. When ssml-green data can be loaded
    for the name, capabilities built from that data are returned, carrying a
    copy of the built-in preset's extensions if it defines any. Otherwise the
    built-in preset itself is returned.

    Args:
        name: Preset name (espeak, pyttsx3, google, polly, azure, minimal, full)

    Returns:
        TTSCapabilities instance

    Raises:
        ValueError: If preset not found
    """
    key = name.lower()

    loaded = _load_ssml_green_preset(key)
    builtin = PRESETS.get(key)

    if loaded is not None:
        # ssml-green data has no extension info; reuse the built-in preset's.
        if builtin and builtin.extensions:
            loaded.extensions = builtin.extensions.copy()
        return loaded

    if builtin is None:
        choices = ", ".join(sorted(PRESETS))
        raise ValueError(f"Unknown preset '{name}'. Available: {choices}")
    return builtin