Source code for ssmd.utils

"""Utility functions for SSMD processing."""

import html
import re
from collections.abc import Callable
from typing import Any


def escape_xml(text: str) -> str:
    """Replace XML-significant characters in ``text`` with entities.

    Delegates to :func:`html.escape` with quote escaping switched on, so
    ``&``, ``<``, ``>``, ``"`` and ``'`` are all converted.

    Args:
        text: Raw text to make safe for embedding in XML.

    Returns:
        The text with special characters replaced by entities.
    """
    escaped = html.escape(text, quote=True)
    return escaped
def unescape_xml(text: str) -> str:
    """Convert entity references in ``text`` back to plain characters.

    Delegates to :func:`html.unescape`, so named and numeric character
    references are both resolved.

    Args:
        text: Text containing entity references.

    Returns:
        The text with entities replaced by the characters they stand for.
    """
    plain = html.unescape(text)
    return plain
def format_ssmd_attr(key: str, value: str) -> str:
    """Render ``key=<quoted value>`` for use inside an SSMD annotation.

    The value is wrapped in double quotes unless it contains a double quote
    and no single quote, in which case single quotes are used. Backslashes,
    curly braces and the chosen quote character are backslash-escaped.

    Args:
        key: Attribute name, emitted verbatim.
        value: Attribute value; coerced with ``str()`` before formatting.

    Returns:
        The formatted ``key=quoted_value`` string.
    """
    text = str(value)

    # Single quotes are chosen only when they avoid escaping altogether.
    prefers_single = '"' in text and "'" not in text
    delimiter = "'" if prefers_single else '"'

    # One pass over the text: every special character gains a backslash.
    escape_table = str.maketrans(
        {
            "\\": "\\\\",
            "{": "\\{",
            "}": "\\}",
            delimiter: "\\" + delimiter,
        }
    )
    body = text.translate(escape_table)
    return f"{key}={delimiter}{body}{delimiter}"
def format_xml(xml_text: str, pretty: bool = True) -> str:
    """Return ``xml_text``, optionally re-indented for readability.

    Pretty printing parses the text with ``xml.dom.minidom``, drops blank
    lines from the rendered output and removes a leading ``<?xml ...?>``
    declaration. If anything goes wrong along the way the input is returned
    unchanged.

    Args:
        xml_text: XML string to format.
        pretty: When false, the input is returned untouched.

    Returns:
        The formatted XML, or the original text if formatting is disabled
        or fails.
    """
    if pretty:
        try:
            from xml.dom import minidom

            document = minidom.parseString(xml_text)
            rendered = document.toprettyxml(indent=" ", encoding=None)
            kept = [row for row in rendered.splitlines() if row.strip()]
            if kept and kept[0].startswith("<?xml"):
                del kept[0]
            return "\n".join(kept)
        except Exception:
            # Best effort only: unparseable input is handed back as-is.
            return xml_text
    return xml_text
[docs] def parse_yaml_header(text: str) -> tuple[dict[str, Any] | None, str]: """Parse YAML front matter from SSMD text. Supports YAML headers wrapped in --- ... --- or --- ... ... . Returns: Tuple of (header_dict, body_text) """ if not text.startswith("---"): return None, text lines = text.splitlines() if not lines or lines[0].strip() != "---": return None, text end_index = None for i in range(1, len(lines)): if lines[i].strip() in {"---", "..."}: end_index = i break if end_index is None: return None, text header_text = "\n".join(lines[1:end_index]) body_text = "\n".join(lines[end_index + 1 :]).lstrip("\n") try: import yaml # type: ignore[import-untyped] except ImportError as exc: raise RuntimeError("pyyaml is required for YAML header parsing") from exc header = yaml.safe_load(header_text) or {} if not isinstance(header, dict): return None, body_text return header, body_text
def _normalize_heading_levels( levels: list[Any], ) -> dict[int, list[tuple[str, str | dict[str, str]]]]: heading_levels: dict[int, list[tuple[str, str | dict[str, str]]]] = {} for entry in levels: if not isinstance(entry, dict): continue for level_key, config in entry.items(): if not isinstance(level_key, str) or not level_key.startswith("level_"): continue try: level = int(level_key.split("_", 1)[1]) except (IndexError, ValueError): continue if not isinstance(config, dict): continue effects: list[tuple[str, str | dict[str, str]]] = [] if "pause_before" in config: effects.append(("pause_before", str(config["pause_before"]))) if "emphasis" in config: effects.append(("emphasis", str(config["emphasis"]))) if "pause" in config: effects.append(("pause", str(config["pause"]))) prosody: dict[str, str] = {} for key in ("volume", "rate", "pitch"): if key in config: prosody[key] = str(config[key]) if prosody: effects.append(("prosody", prosody)) if effects: heading_levels[level] = effects return heading_levels def _normalize_extensions( entries: list[Any], ) -> dict[str, Callable[[str], str]]: extensions: dict[str, Callable[[str], str]] = {} for entry in entries: if not isinstance(entry, dict): continue for name, config in entry.items(): if not name: continue if isinstance(config, dict): value = config.get("value") else: value = config if not isinstance(value, str): continue if "{text}" not in value: raise ValueError( f"Extension template for '{name}' must include '{{text}}'." ) template = value def _handler(text: str, template: str = template) -> str: return template.replace("{text}", text) extensions[str(name)] = _handler return extensions
def build_config_from_header(header: dict[str, Any]) -> dict[str, Any]:
    """Derive configuration entries from a parsed header dict.

    Reads the ``heading`` and ``extensions`` keys. Each is only used when
    its value is a list, and only contributes to the result when
    normalisation yields something non-empty.

    Args:
        header: Parsed header mapping.

    Returns:
        Dict that may contain ``heading_levels`` and/or ``extensions``;
        empty when neither key provides usable data.
    """
    settings: dict[str, Any] = {}

    if isinstance(raw_headings := header.get("heading"), list) and (
        levels := _normalize_heading_levels(raw_headings)
    ):
        settings["heading_levels"] = levels

    if isinstance(raw_extensions := header.get("extensions"), list) and (
        handlers := _normalize_extensions(raw_extensions)
    ):
        settings["extensions"] = handlers

    return settings
def extract_sentences(ssml: str) -> list[str]:
    """Extract sentences from SSML.

    The input is parsed as XML. All ``<s>`` elements are returned if any
    exist, otherwise all ``<p>`` elements, otherwise the content of the root
    element as a single string. Namespace prefixes on tags are ignored when
    matching. If the input cannot be parsed as XML, a regex-based fallback
    applies the same ``<s>`` / ``<p>`` / whole-content order; in that
    fallback the ``<s>`` and ``<p>`` results are the inner content of the
    tags rather than the serialized elements.

    Args:
        ssml: SSML string.

    Returns:
        List of SSML sentence strings; empty if there is no content.
    """
    import xml.etree.ElementTree as ET

    def _local_name(tag: str) -> str:
        # Drop a leading "{namespace}" qualifier, if any.
        return tag.split("}")[-1]

    def _without_tail(elem: ET.Element) -> str:
        # ET.tostring() also writes the text that follows the element (its
        # tail). A sentence must not carry that trailing text, so the tail is
        # hidden for the duration of the serialization and then restored.
        saved_tail, elem.tail = elem.tail, None
        try:
            return ET.tostring(elem, encoding="unicode")
        finally:
            elem.tail = saved_tail

    try:
        root = ET.fromstring(ssml)

        for wanted in ("s", "p"):
            found = [elem for elem in root.iter() if _local_name(elem.tag) == wanted]
            if found:
                return [_without_tail(elem) for elem in found]

        # No sentence or paragraph markup: return the root's inner content.
        # Each child's tail is already included by ET.tostring(), so it must
        # not be appended a second time.
        parts: list[str] = [root.text or ""]
        parts.extend(ET.tostring(child, encoding="unicode") for child in root)
        clean = "".join(parts).strip()
        return [clean] if clean else []
    except Exception:
        # Deliberate best-effort: input that is not well-formed XML is
        # handled with regular expressions instead of failing.
        for tag_pattern in (r"<s\b[^>]*>(.*?)</s>", r"<p\b[^>]*>(.*?)</p>"):
            matches = re.findall(tag_pattern, ssml, re.DOTALL)
            if matches:
                return matches

        # Last resort: strip the <speak> wrapper (with or without
        # attributes) and return what is left as a single sentence.
        clean = re.sub(r"</?speak\b[^>]*>", "", ssml).strip()
        return [clean] if clean else []
# Unicode private use area characters for placeholders # Using \uf000+ range which is not transformed by phrasplit/spaCy # (The \ue000-\ue00f range gets converted to dots/ellipses by some NLP tools) _PLACEHOLDER_MAP = { "*": "\uf000", # ASTERISK "_": "\uf001", # UNDERSCORE "[": "\uf002", # LEFT BRACKET "]": "\uf003", # RIGHT BRACKET ".": "\uf004", # DOT "@": "\uf005", # AT SIGN "#": "\uf006", # HASH "~": "\uf007", # TILDE "+": "\uf008", # PLUS "-": "\uf009", # HYPHEN "<": "\uf00a", # LESS THAN ">": "\uf00b", # GREATER THAN "^": "\uf00c", # CARET } # Reverse map for unescaping _REVERSE_PLACEHOLDER_MAP = {v: k for k, v in _PLACEHOLDER_MAP.items()}
[docs] def escape_ssmd_syntax( text: str, patterns: list[str] | None = None, ) -> str: """Escape SSMD syntax patterns to prevent interpretation as markup. Note: Escaping is reversible but not length-preserving. Any offsets derived from escaped text should be mapped against the unescaped clean text instead. This is useful when processing plain text or markdown that may contain characters that coincidentally match SSMD syntax patterns. Uses placeholder replacement which is reversed after SSML processing. Args: text: Input text that may contain SSMD-like patterns patterns: List of pattern types to escape. If None, escapes all. Valid values: 'emphasis', 'annotations', 'breaks', 'marks', 'headings', 'directives' Returns: Text with SSMD patterns replaced with placeholders Example: >>> text = "This *word* should not be emphasized" >>> escape_ssmd_syntax(text) 'This \\uf000word\\uf000 should not be emphasized' >>> text = 'Visit [our site]{src="https://example.com"}' >>> escaped = escape_ssmd_syntax(text) # Placeholders prevent SSMD interpretation >>> # Selective escaping >>> escape_ssmd_syntax(text, patterns=['emphasis', 'breaks']) """ if patterns is None: # Escape all patterns by default patterns = [ "emphasis", "annotations", "breaks", "marks", "headings", "directives", ] result = text # Process patterns in specific order (most specific first) # Replace special characters with placeholders if "directives" in patterns: # Directives at line start: <div ...> result = re.sub( r"^(\s*)<div\s+", lambda m: m.group(1) + _PLACEHOLDER_MAP["<"] + "div ", result, flags=re.MULTILINE, ) result = re.sub( r"^(\s*)</div>", lambda m: m.group(1) + _PLACEHOLDER_MAP["<"] + "/div>", result, flags=re.MULTILINE, ) if "headings" in patterns: # Headings at line start: #, ##, ### result = re.sub( r"^(#{1,6})(\s)", lambda m: _PLACEHOLDER_MAP["#"] * len(m.group(1)) + m.group(2), result, flags=re.MULTILINE, ) if "emphasis" in patterns: # Strong emphasis: **text** result = re.sub( r"\*\*([^*]+)\*\*", 
lambda m: _PLACEHOLDER_MAP["*"] * 2 + m.group(1) + _PLACEHOLDER_MAP["*"] * 2, result, ) # Moderate emphasis: *text* result = re.sub( r"\*([^*\n]+)\*", lambda m: _PLACEHOLDER_MAP["*"] + m.group(1) + _PLACEHOLDER_MAP["*"], result, ) # Reduced emphasis/pitch: _text_ (but not in middle of words) result = re.sub( r"(?<!\w)_([^_\n]+)_(?!\w)", lambda m: _PLACEHOLDER_MAP["_"] + m.group(1) + _PLACEHOLDER_MAP["_"], result, ) if "annotations" in patterns: # Annotations: [text]{params} - replace the brackets result = re.sub( r"\[([^\]]+)\]\{([^}]+)\}", lambda m: _PLACEHOLDER_MAP["["] + m.group(1) + _PLACEHOLDER_MAP["]"] + "{" + m.group(2) + "}", result, ) if "breaks" in patterns: # Breaks: ...n, ...w, ...c, ...s, ...p, ...500ms, ...5s result = re.sub( r"\.\.\.((?:[nwcsp]|\d+(?:ms|s)))(?=\s|$|[.!?,;:])", lambda m: _PLACEHOLDER_MAP["."] * 3 + m.group(1), result, ) if "marks" in patterns: # Marks: @word # Require whitespace boundaries to avoid matching handles or emails result = re.sub( r"(?<!\S)@(\w+)(?=\s|$)", lambda m: _PLACEHOLDER_MAP["@"] + m.group(1), result, ) return result
def unescape_ssmd_syntax(text: str, *, xml_safe: bool = False) -> str:
    """Restore the original characters for SSMD escape placeholders.

    Every placeholder listed in ``_REVERSE_PLACEHOLDER_MAP`` is swapped back
    for the character it stands for.

    Args:
        text: Text with placeholder-escaped SSMD syntax
        xml_safe: If True, the ``<`` and ``>`` placeholders are restored as
            ``&lt;`` and ``&gt;`` instead of the raw characters.

    Returns:
        Text with placeholders replaced by original characters

    Example:
        >>> unescape_ssmd_syntax("This \\uf000word\\uf000 is escaped")
        'This *word* is escaped'
    """
    restore = dict(_REVERSE_PLACEHOLDER_MAP)
    if xml_safe:
        restore.update(
            {
                _PLACEHOLDER_MAP["<"]: "&lt;",
                _PLACEHOLDER_MAP[">"]: "&gt;",
            }
        )

    # Each placeholder is a single character, so one translate pass does it.
    return text.translate(str.maketrans(restore))