"""Utility functions for SSMD processing."""
import html
import re
from collections.abc import Callable
from typing import Any
[docs]
def escape_xml(text: str) -> str:
    """Replace XML special characters in ``text`` with entity references.

    Delegates to :func:`html.escape` with quote escaping enabled, so quote
    characters are escaped in addition to ``&``, ``<`` and ``>``.

    Args:
        text: Raw text that may contain XML special characters.

    Returns:
        The text with those characters replaced by entities.
    """
    escaped = html.escape(text, quote=True)
    return escaped
[docs]
def unescape_xml(text: str) -> str:
    """Turn entity references in ``text`` back into plain characters.

    Delegates to :func:`html.unescape`.

    Args:
        text: Text that may contain character/entity references.

    Returns:
        The text with those references replaced by the characters they
        stand for.
    """
    decoded = html.unescape(text)
    return decoded
def _normalize_heading_levels(
levels: list[Any],
) -> dict[int, list[tuple[str, str | dict[str, str]]]]:
heading_levels: dict[int, list[tuple[str, str | dict[str, str]]]] = {}
for entry in levels:
if not isinstance(entry, dict):
continue
for level_key, config in entry.items():
if not isinstance(level_key, str) or not level_key.startswith("level_"):
continue
try:
level = int(level_key.split("_", 1)[1])
except (IndexError, ValueError):
continue
if not isinstance(config, dict):
continue
effects: list[tuple[str, str | dict[str, str]]] = []
if "pause_before" in config:
effects.append(("pause_before", str(config["pause_before"])))
if "emphasis" in config:
effects.append(("emphasis", str(config["emphasis"])))
if "pause" in config:
effects.append(("pause", str(config["pause"])))
prosody: dict[str, str] = {}
for key in ("volume", "rate", "pitch"):
if key in config:
prosody[key] = str(config[key])
if prosody:
effects.append(("prosody", prosody))
if effects:
heading_levels[level] = effects
return heading_levels
def _normalize_extensions(
entries: list[Any],
) -> dict[str, Callable[[str], str]]:
extensions: dict[str, Callable[[str], str]] = {}
for entry in entries:
if not isinstance(entry, dict):
continue
for name, config in entry.items():
if not name:
continue
if isinstance(config, dict):
value = config.get("value")
else:
value = config
if not isinstance(value, str):
continue
if "{text}" not in value:
raise ValueError(
f"Extension template for '{name}' must include '{{text}}'."
)
template = value
def _handler(text: str, template: str = template) -> str:
return template.replace("{text}", text)
extensions[str(name)] = _handler
return extensions
# Unicode private use area characters for placeholders
# Using \uf000+ range which is not transformed by phrasplit/spaCy
# (The \ue000-\ue00f range gets converted to dots/ellipses by some NLP tools)
_PLACEHOLDER_MAP = {
"*": "\uf000", # ASTERISK
"_": "\uf001", # UNDERSCORE
"[": "\uf002", # LEFT BRACKET
"]": "\uf003", # RIGHT BRACKET
".": "\uf004", # DOT
"@": "\uf005", # AT SIGN
"#": "\uf006", # HASH
"~": "\uf007", # TILDE
"+": "\uf008", # PLUS
"-": "\uf009", # HYPHEN
"<": "\uf00a", # LESS THAN
">": "\uf00b", # GREATER THAN
"^": "\uf00c", # CARET
}
# Reverse map for unescaping
_REVERSE_PLACEHOLDER_MAP = {v: k for k, v in _PLACEHOLDER_MAP.items()}
[docs]
def escape_ssmd_syntax(
    text: str,
    patterns: list[str] | None = None,
) -> str:
    """Escape SSMD syntax patterns to prevent interpretation as markup.

    This is useful when processing plain text or markdown that may contain
    characters that coincidentally match SSMD syntax patterns. The marker
    characters of each matched pattern are swapped for private-use
    placeholder characters, which ``unescape_ssmd_syntax`` turns back into
    the original characters after processing.

    Note:
        Only the marker characters are replaced; all surrounding text,
        including whitespace, is kept verbatim so that the escaping can be
        reversed. Offsets should still be computed against the unescaped
        clean text rather than the escaped text.

    Args:
        text: Input text that may contain SSMD-like patterns.
        patterns: List of pattern types to escape. If None, escapes all.
            Valid values: 'emphasis', 'annotations', 'breaks', 'marks',
            'headings', 'directives'. Other values are ignored.

    Returns:
        Text with SSMD patterns replaced with placeholders.

    Example:
        >>> text = "This *word* should not be emphasized"
        >>> escape_ssmd_syntax(text)
        'This \\uf000word\\uf000 should not be emphasized'

        >>> text = 'Visit [our site]{src="https://example.com"}'
        >>> escape_ssmd_syntax(text)  # brackets become placeholders
        'Visit \\uf002our site\\uf003{src="https://example.com"}'

        >>> # Selective escaping: annotations are left alone here
        >>> escape_ssmd_syntax(text, patterns=['emphasis', 'breaks'])
        'Visit [our site]{src="https://example.com"}'
    """
    if patterns is None:
        # Escape all patterns by default
        patterns = [
            "emphasis",
            "annotations",
            "breaks",
            "marks",
            "headings",
            "directives",
        ]

    result = text

    # Process patterns in a fixed order (most specific first), replacing the
    # special characters with placeholders.
    if "directives" in patterns:
        # Opening directive at line start: <div ...>
        # The whitespace after "div" is captured and re-emitted unchanged so
        # that only the "<" is altered and unescaping restores the input.
        result = re.sub(
            r"^(\s*)<div(\s+)",
            lambda m: m.group(1) + _PLACEHOLDER_MAP["<"] + "div" + m.group(2),
            result,
            flags=re.MULTILINE,
        )
        # Closing directive at line start: </div>
        result = re.sub(
            r"^(\s*)</div>",
            lambda m: m.group(1) + _PLACEHOLDER_MAP["<"] + "/div>",
            result,
            flags=re.MULTILINE,
        )

    if "headings" in patterns:
        # Headings at line start: one to six "#" followed by whitespace
        result = re.sub(
            r"^(#{1,6})(\s)",
            lambda m: _PLACEHOLDER_MAP["#"] * len(m.group(1)) + m.group(2),
            result,
            flags=re.MULTILINE,
        )

    if "emphasis" in patterns:
        # Strong emphasis: **text** (handled before the single-asterisk form
        # so the double markers are consumed first)
        result = re.sub(
            r"\*\*([^*]+)\*\*",
            lambda m: _PLACEHOLDER_MAP["*"] * 2
            + m.group(1)
            + _PLACEHOLDER_MAP["*"] * 2,
            result,
        )
        # Moderate emphasis: *text*
        result = re.sub(
            r"\*([^*\n]+)\*",
            lambda m: _PLACEHOLDER_MAP["*"] + m.group(1) + _PLACEHOLDER_MAP["*"],
            result,
        )
        # Reduced emphasis/pitch: _text_ (but not in middle of words)
        result = re.sub(
            r"(?<!\w)_([^_\n]+)_(?!\w)",
            lambda m: _PLACEHOLDER_MAP["_"] + m.group(1) + _PLACEHOLDER_MAP["_"],
            result,
        )

    if "annotations" in patterns:
        # Annotations: [text]{params} - only the square brackets are replaced
        result = re.sub(
            r"\[([^\]]+)\]\{([^}]+)\}",
            lambda m: _PLACEHOLDER_MAP["["]
            + m.group(1)
            + _PLACEHOLDER_MAP["]"]
            + "{"
            + m.group(2)
            + "}",
            result,
        )

    if "breaks" in patterns:
        # Breaks: ...n, ...w, ...c, ...s, ...p, ...500ms, ...5s
        result = re.sub(
            r"\.\.\.((?:[nwcsp]|\d+(?:ms|s)))(?=\s|$|[.!?,;:])",
            lambda m: _PLACEHOLDER_MAP["."] * 3 + m.group(1),
            result,
        )

    if "marks" in patterns:
        # Marks: @word
        # Require whitespace boundaries to avoid matching handles or emails
        result = re.sub(
            r"(?<!\S)@(\w+)(?=\s|$)",
            lambda m: _PLACEHOLDER_MAP["@"] + m.group(1),
            result,
        )

    return result
[docs]
def unescape_ssmd_syntax(text: str, *, xml_safe: bool = False) -> str:
    """Remove placeholder escaping from SSMD syntax.

    This is used internally to replace placeholders with original characters
    after TTS processing.

    Args:
        text: Text with placeholder-escaped SSMD syntax.
        xml_safe: If True, keep XML special characters escaped when restoring
            placeholders: the ``<`` placeholder becomes ``&lt;`` and the
            ``>`` placeholder becomes ``&gt;`` instead of the raw characters.

    Returns:
        Text with placeholders replaced by original characters.

    Example:
        >>> unescape_ssmd_syntax("This \\uf000word\\uf000 is escaped")
        'This *word* is escaped'
        >>> unescape_ssmd_syntax("\\uf00adiv>", xml_safe=True)
        '&lt;div>'
    """
    replacements = dict(_REVERSE_PLACEHOLDER_MAP)
    if xml_safe:
        # Emit entity references so the restored text stays valid inside XML.
        replacements[_PLACEHOLDER_MAP["<"]] = "&lt;"
        replacements[_PLACEHOLDER_MAP[">"]] = "&gt;"

    # Every key is a single character, so one translate pass replaces all
    # placeholders at once.
    return text.translate(str.maketrans(replacements))