"""SSMD Document - Main document container with rich TTS features."""
from collections.abc import Iterator
from typing import TYPE_CHECKING, Any, overload
from ssmd.formatter import format_ssmd
from ssmd.paragraph import Paragraph
from ssmd.parser import parse_paragraphs, parse_sentences
from ssmd.utils import (
build_config_from_header,
format_xml,
)
from ssmd.utils import (
parse_yaml_header as parse_yaml_front_matter,
)
if TYPE_CHECKING:
from ssmd.capabilities import TTSCapabilities
from ssmd.sentence import Sentence
[docs]
class Document:
"""Main SSMD document container with incremental building and editing.
This is the primary interface for working with SSMD documents. It provides
a clean, document-centric API for creating, editing, and exporting TTS content.
The Document stores content as fragments (pieces of text) with separators
between them, allowing efficient incremental building and editing while
preserving the document structure.
Example:
Basic usage::
import ssmd
# Create and build a document
doc = ssmd.Document()
doc.add_sentence("Hello world!")
doc.add_sentence("This is SSMD.")
# Export to different formats
ssml = doc.to_ssml()
text = doc.to_text()
# Iterate for streaming TTS
for sentence in doc.sentences():
tts_engine.speak(sentence)
Advanced usage::
# Load from SSML
doc = ssmd.Document.from_ssml("<speak>Hello</speak>")
# Edit the document
doc[0] = "Modified content"
doc.add_paragraph("New paragraph")
# Access raw content
print(doc.ssmd) # Raw SSMD markdown
"""
[docs]
def __init__(
self,
content: str = "",
config: dict[str, Any] | None = None,
capabilities: "TTSCapabilities | str | None" = None,
escape_syntax: bool = False,
escape_patterns: list[str] | None = None,
parse_yaml_header: bool = False,
strict: bool = False,
) -> None:
"""Initialize a new SSMD document.
Args:
content: Optional initial SSMD content
config: Configuration dictionary with options:
- skip (list): Processor names to skip
- output_speak_tag (bool): Wrap in <speak> tags (default: True)
- pretty_print (bool): Format XML output (default: False)
- auto_sentence_tags (bool): Auto-wrap sentences (default: False)
- heading_levels (dict): Custom heading configurations
- extensions (dict): Registered extension handlers
- namespaces (dict): XML namespaces to add to the <speak> tag
- sentence_model_size (str): spaCy model size for sentence
detection ("sm", "md", "lg", "trf"). Default: "sm"
- sentence_spacy_model (str): Deprecated alias; model size is
inferred from the name (overrides sentence_model_size)
- sentence_use_spacy (bool): If False, use fast regex splitting
instead of spaCy. Default: True
capabilities: TTS capabilities (TTSCapabilities instance or
preset name). Presets: 'espeak', 'pyttsx3', 'google',
'polly', 'azure', 'minimal', 'full'
escape_syntax: If True, escape SSMD-like syntax in content to
prevent interpretation as markup. Useful for plain text or
markdown that may coincidentally contain SSMD patterns.
escape_patterns: List of specific pattern types to escape when
escape_syntax=True. If None, escapes all patterns.
Valid values: 'emphasis', 'annotations', 'breaks', 'marks',
'headings', 'directives'
parse_yaml_header: If True, parse YAML front matter and store it
on doc.header while stripping it from the SSMD body. If False,
YAML front matter is preserved as part of the content.
strict: If True, emit warnings and apply ssml-green validation
rules where possible.
Example:
>>> doc = ssmd.Document("Hello *world*!")
>>> doc = ssmd.Document(capabilities='pyttsx3')
>>> doc = ssmd.Document("Text", config={'auto_sentence_tags': True})
>>> # Fast sentence detection (no spaCy required)
>>> doc = ssmd.Document(config={'sentence_use_spacy': False})
>>> # High quality sentence detection
>>> doc = ssmd.Document(config={'sentence_model_size': 'lg'})
>>> # Escape SSMD syntax for plain text/markdown
>>> doc = ssmd.Document(markdown, escape_syntax=True)
>>> # Selective escaping
>>> doc = ssmd.Document(
... text,
... escape_syntax=True,
... escape_patterns=['emphasis', 'annotations']
... )
"""
self._fragments: list[str] = []
self._separators: list[str] = []
self._config = config or {}
self._capabilities = capabilities
self._capabilities_obj: TTSCapabilities | None = None # Resolved capabilities
self._cached_ssml: str | None = None
self._cached_sentences: list[str] | None = None
self._cached_paragraphs: list[Paragraph] | None = None
self._escape_syntax = escape_syntax
self._escape_patterns = escape_patterns
self._strict = strict
self._parse_yaml_header = parse_yaml_header
self.header: dict[str, Any] | None = None
self.warnings: list[str] = []
# Add initial content if provided
if content:
header_config: dict[str, Any] = {}
if parse_yaml_header:
header, content = parse_yaml_front_matter(content)
if header is not None:
self.header = header
header_config = build_config_from_header(header)
content = content.lstrip("\n")
if escape_syntax:
from ssmd.utils import escape_ssmd_syntax
content = escape_ssmd_syntax(content, patterns=escape_patterns)
self._config.update(header_config)
self._fragments.append(content)
[docs]
@classmethod
def from_ssml(
    cls,
    ssml: str,
    config: dict[str, Any] | None = None,
    capabilities: "TTSCapabilities | str | None" = None,
) -> "Document":
    """Build a Document by converting an SSML string to SSMD.

    The SSML is converted with ``SSMLParser.to_ssmd`` and the resulting SSMD
    text becomes the content of the new document. YAML header parsing is
    disabled for the new document.

    Args:
        ssml: SSML XML string
        config: Optional configuration parameters
        capabilities: Optional TTS capabilities

    Returns:
        New Document instance with converted content

    Example:
        >>> ssml = '<speak><emphasis>Hello</emphasis> world</speak>'
        >>> doc = ssmd.Document.from_ssml(ssml)
        >>> doc.ssmd
        '*Hello* world'
    """
    from ssmd.ssml_parser import SSMLParser

    parser_config = config or {}
    converted = SSMLParser(parser_config).to_ssmd(ssml, capabilities=capabilities)
    return cls(converted, config, capabilities, parse_yaml_header=False)
[docs]
@classmethod
def from_text(
cls,
text: str,
config: dict[str, Any] | None = None,
capabilities: "TTSCapabilities | str | None" = None,
parse_yaml_header: bool = False,
strict: bool = False,
) -> "Document":
"""Create a Document from plain text.
This is essentially the same as Document(text), but provides
a symmetric API with from_ssml().
Args:
text: Plain text or SSMD content
config: Optional configuration parameters
capabilities: Optional TTS capabilities
Returns:
New Document instance
Example:
>>> doc = ssmd.Document.from_text("Hello world")
>>> doc.ssmd
'Hello world'
"""
return cls(
text,
config,
capabilities,
parse_yaml_header=parse_yaml_header,
strict=strict,
)
# ═══════════════════════════════════════════════════════════
# BUILDING METHODS
# ═══════════════════════════════════════════════════════════
[docs]
def add(self, text: str) -> "Document":
    """Append text directly after the existing content.

    The new fragment is joined to the previous one with an empty
    separator, i.e. no spacing is inserted. Empty text is ignored.

    Args:
        text: SSMD text to append

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("Hello")
        >>> doc.add(" world")
        >>> doc.ssmd
        'Hello world'
    """
    if text:
        self._invalidate_cache()
        # A separator is only recorded between two fragments, never
        # in front of the very first one.
        if self._fragments:
            self._separators.append("")
        self._fragments.append(text)
    return self
[docs]
def add_sentence(self, text: str) -> "Document":
    """Append text on a new line.

    The new fragment is joined to the previous one with a single newline.
    Empty text is ignored.

    Args:
        text: SSMD text to append

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("First sentence.")
        >>> doc.add_sentence("Second sentence.")
        >>> doc.ssmd
        'First sentence.\\nSecond sentence.'
    """
    if text:
        self._invalidate_cache()
        # The first fragment of a document has no separator before it.
        if self._fragments:
            self._separators.append("\n")
        self._fragments.append(text)
    return self
[docs]
def add_paragraph(self, text: str) -> "Document":
    """Append text as a new paragraph.

    The new fragment is joined to the previous one with a blank line
    (two newlines). Empty text is ignored.

    Args:
        text: SSMD text to append

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("First paragraph.")
        >>> doc.add_paragraph("Second paragraph.")
        >>> doc.ssmd
        'First paragraph.\\n\\nSecond paragraph.'
    """
    if text:
        self._invalidate_cache()
        # The first fragment of a document has no separator before it.
        if self._fragments:
            self._separators.append("\n\n")
        self._fragments.append(text)
    return self
# ═══════════════════════════════════════════════════════════
# EXPORT METHODS
# ═══════════════════════════════════════════════════════════
[docs]
def to_ssml(self) -> str:
    """Export document to SSML format.

    The rendered string is stored on the instance and returned as-is on
    later calls until the cache is cleared.

    Returns:
        SSML XML string

    Example:
        >>> doc = ssmd.Document("Hello *world*!")
        >>> doc.to_ssml()
        '<speak>Hello <emphasis>world</emphasis>!</speak>'
    """
    if self._cached_ssml is not None:
        return self._cached_ssml

    source = self.ssmd
    capabilities = self._get_capabilities()

    # Rendering options
    settings = self._config
    wrap_in_speak = settings.get("output_speak_tag", True)
    wrap_sentences = settings.get("auto_sentence_tags", False)
    pretty = settings.get("pretty_print", False)
    extensions = settings.get("extensions")
    heading_levels = settings.get("heading_levels")

    # Parse SSMD into sentences (with placeholders if escape_syntax=True)
    parsed = parse_sentences(
        source,
        capabilities=capabilities,
        model_size=settings.get("sentence_model_size"),
        spacy_model=settings.get("sentence_spacy_model"),
        use_spacy=settings.get("sentence_use_spacy"),
        heading_levels=heading_levels,
        extensions=extensions,
    )
    namespaces = self._collect_namespaces(parsed, extensions, capabilities)
    use_p_tags = not capabilities or capabilities.paragraph

    def render(sentence) -> str:
        return sentence.to_ssml(
            capabilities=capabilities,
            extensions=extensions,
            wrap_sentence=wrap_sentences,
            warnings=self.warnings if self._strict else None,
        )

    if use_p_tags:
        # Group sentences until a paragraph end, then emit one <p> element.
        blocks: list[str] = []
        pending: list[str] = []
        for sentence in parsed:
            pending.append(render(sentence))
            if sentence.is_paragraph_end:
                blocks.append("<p>" + " ".join(pending).strip() + "</p>")
                pending.clear()
        if pending:
            # Trailing sentences that never reached a paragraph end
            blocks.append("<p>" + " ".join(pending).strip() + "</p>")
        ssml = "".join(blocks)
    else:
        ssml = " ".join([render(sentence) for sentence in parsed])

    # Wrap in <speak> tags if configured
    if wrap_in_speak:
        needs_amazon_namespace = (
            "amazon:" in ssml
            and "amazon" not in namespaces
            and "xmlns:amazon" not in ssml
        )
        if needs_amazon_namespace:
            namespaces = {
                **namespaces,
                "amazon": "https://amazon.com/ssml",
            }
        namespace_attrs = self._format_namespace_attrs(namespaces)
        ssml = f"<speak{namespace_attrs}>{ssml}</speak>"

    # Unescape placeholders AFTER generating SSML
    # (restore original characters in output)
    if self._escape_syntax:
        from ssmd.utils import unescape_ssmd_syntax

        ssml = unescape_ssmd_syntax(ssml, xml_safe=True)

    # Pretty print if configured
    if pretty:
        ssml = format_xml(ssml, pretty=True)

    self._cached_ssml = ssml
    return self._cached_ssml
[docs]
def to_ssmd(self) -> str:
    """Export document to formatted SSMD.

    The content is parsed into sentences and re-emitted through
    ``format_ssmd`` with trailing newlines removed. Content that is empty
    or whitespace-only is returned unchanged.

    Returns:
        SSMD markdown string with proper formatting

    Example:
        >>> doc = ssmd.Document.from_ssml('<speak><emphasis>Hi</emphasis></speak>')
        >>> doc.to_ssmd()
        '*Hi*'
    """
    source = self.ssmd
    if source.strip():
        # Parse into sentences and format with proper line breaks
        result = format_ssmd(self._parse_sentence_objects()).rstrip("\n")
        if self._escape_syntax:
            from ssmd.utils import unescape_ssmd_syntax

            result = unescape_ssmd_syntax(result)
        return result
    return source
[docs]
def to_text(self) -> str:
    """Export document to plain text (strips all markup).

    Each parsed sentence is converted with its ``to_text()`` method and the
    results are joined with single spaces.

    Returns:
        Plain text string with all SSMD markup removed

    Example:
        >>> doc = ssmd.Document("Hello *world* @marker!")
        >>> doc.to_text()
        'Hello world!'
    """
    plain = " ".join(
        [sentence.to_text() for sentence in self._parse_sentence_objects()]
    )
    if self._escape_syntax:
        from ssmd.utils import unescape_ssmd_syntax

        plain = unescape_ssmd_syntax(plain)
    return plain
# ═══════════════════════════════════════════════════════════
# PROPERTIES
# ═══════════════════════════════════════════════════════════
@property
def ssmd(self) -> str:
    """Get raw SSMD content.

    Returns the complete SSMD document by interleaving the stored
    fragments with the separators recorded between them.

    Returns:
        SSMD markdown string ("" for an empty document)
    """
    fragments = self._fragments
    if not fragments:
        return ""
    if len(fragments) == 1:
        return fragments[0]

    # Collect the pieces and join once instead of growing a string with
    # "+=" for every fragment. Separator i sits before fragment i + 1.
    pieces = [fragments[0]]
    for position, separator in enumerate(self._separators, start=1):
        pieces.append(separator)
        pieces.append(fragments[position])
    return "".join(pieces)
@property
def config(self) -> dict[str, Any]:
    """Configuration dictionary used by this document.

    Returns:
        The stored configuration dict itself (not a copy)
    """
    return self._config


@config.setter
def config(self, value: dict[str, Any]) -> None:
    """Replace the configuration dictionary.

    The resolved capabilities object and all cached output are dropped.

    Args:
        value: New configuration dict
    """
    self._capabilities_obj = None  # Reset resolved capabilities
    self._config = value
    self._invalidate_cache()
@property
def capabilities(self) -> "TTSCapabilities | str | None":
    """TTS capabilities as originally supplied.

    Returns:
        TTSCapabilities instance, preset name, or None
    """
    return self._capabilities


@capabilities.setter
def capabilities(self, value: "TTSCapabilities | str | None") -> None:
    """Replace the TTS capabilities.

    The previously resolved capabilities object and all cached output are
    dropped.

    Args:
        value: TTSCapabilities instance, preset name, or None
    """
    self._capabilities_obj = None  # Reset resolved capabilities
    self._capabilities = value
    self._invalidate_cache()
# ═══════════════════════════════════════════════════════════
# ITERATION
# ═══════════════════════════════════════════════════════════
[docs]
def sentences(self, as_documents: bool = False) -> "Iterator[str | Document]":
    """Iterate through sentence-level SSML chunks.

    The chunks come from the sentence cache, which renders every sentence
    with sentence wrapping enabled.

    Args:
        as_documents: If True, yield Document objects instead of strings.
            Each sentence is converted with ``Document.from_ssml`` using
            this document's config and capabilities.

    Yields:
        SSML sentence strings (str), or Document objects if as_documents=True

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> for sentence in doc.sentences():
        ...     tts_engine.speak(sentence)
        >>> for sentence_doc in doc.sentences(as_documents=True):
        ...     ssml = sentence_doc.to_ssml()
        ...     ssmd = sentence_doc.to_ssmd()
    """
    self._populate_sentence_cache()
    cached = self._cached_sentences or []
    if as_documents:
        for sentence_ssml in cached:
            yield Document.from_ssml(
                sentence_ssml,
                config=self._config,
                capabilities=self._capabilities,
            )
    else:
        for sentence_ssml in cached:
            yield sentence_ssml
[docs]
def paragraphs(self, as_documents: bool = False) -> "Iterator[str | Document]":
    """Iterate through paragraph-level SSML chunks.

    Each paragraph's sentences are rendered and joined with single spaces;
    paragraphs that render to nothing are skipped. The result is wrapped in
    ``<p>`` tags unless the resolved capabilities disable paragraphs.

    Args:
        as_documents: If True, yield Document objects instead of strings.

    Yields:
        SSML paragraph strings (str), or Document objects if as_documents=True
    """
    self._populate_paragraph_cache()
    capabilities = self._get_capabilities()
    extensions = self._config.get("extensions")
    wrap_sentences = self._config.get("auto_sentence_tags", False)
    use_p_tags = not capabilities or capabilities.paragraph

    for paragraph in self._cached_paragraphs or []:
        rendered = [
            sentence.to_ssml(
                capabilities=capabilities,
                extensions=extensions,
                wrap_sentence=wrap_sentences,
                warnings=self.warnings if self._strict else None,
            )
            for sentence in paragraph.sentences
        ]
        # Drop sentences that rendered to an empty string before joining.
        body = " ".join(filter(None, rendered)).strip()
        if not body:
            continue

        chunk = f"<p>{body}</p>" if use_p_tags else body
        if self._escape_syntax:
            from ssmd.utils import unescape_ssmd_syntax

            chunk = unescape_ssmd_syntax(chunk, xml_safe=True)

        if as_documents:
            yield Document.from_ssml(
                chunk,
                config=self._config,
                capabilities=self._capabilities,
            )
        else:
            yield chunk
# ═══════════════════════════════════════════════════════════
# LIST-LIKE INTERFACE (operates on SSML sentences)
# ═══════════════════════════════════════════════════════════
[docs]
def __len__(self) -> int:
    """Return number of sentences in the document.

    Uses the sentence cache when it is populated; otherwise the content is
    parsed to count the sentences (the cache is left untouched).

    Returns:
        Number of sentences

    Example:
        >>> doc = ssmd.Document("First sentence. Second sentence.")
        >>> len(doc)
        2
    """
    cached = self._cached_sentences
    if cached is None:
        return len(self._parse_sentence_objects())
    return len(cached)
@overload
def __getitem__(self, index: int) -> str: ...
@overload
def __getitem__(self, index: slice) -> list[str]: ...
[docs]
def __getitem__(self, index: int | slice) -> str | list[str]:
"""Get sentence(s) by index.
Args:
index: Sentence index or slice
Returns:
SSML sentence string or list of strings
Raises:
IndexError: If index is out of range
Example:
>>> doc = ssmd.Document("First. Second. Third.")
>>> doc[0] # First sentence SSML
>>> doc[-1] # Last sentence SSML
>>> doc[0:2] # First two sentences
"""
self._populate_sentence_cache()
return (self._cached_sentences or [])[index]
[docs]
def __setitem__(self, index: int, value: str) -> None:
    """Replace sentence at index.

    This reconstructs the document with the modified sentence. Negative
    indices count from the end, as for a list.

    Args:
        index: Sentence index
        value: New SSMD content for this sentence

    Raises:
        IndexError: If index is out of range

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> doc[0] = "Modified first sentence."
    """
    self._populate_sentence_cache()
    sentences = self._cached_sentences or []

    # The rebuild helper compares positions with "==", so a negative or
    # out-of-range index would match nothing and be silently ignored.
    # Normalize and validate it here instead.
    total = len(sentences)
    position = index + total if index < 0 else index
    if not 0 <= position < total:
        raise IndexError("Document sentence index out of range")

    self._rebuild_from_sentence_ssml(
        sentences,
        replacement_index=position,
        replacement_ssmd=value,
    )
[docs]
def __delitem__(self, index: int) -> None:
    """Delete sentence at index.

    The document is rebuilt from the remaining sentences. Negative indices
    count from the end, as for a list.

    Args:
        index: Sentence index

    Raises:
        IndexError: If index is out of range

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> del doc[1]  # Remove second sentence
    """
    self._populate_sentence_cache()
    sentences = self._cached_sentences or []

    # Filtering with "i != index" never matches a negative or out-of-range
    # index, which made such deletions a silent no-op. Normalize and
    # validate the index first.
    total = len(sentences)
    position = index + total if index < 0 else index
    if not 0 <= position < total:
        raise IndexError("Document sentence index out of range")

    remaining_sentences = sentences[:position] + sentences[position + 1 :]
    self._rebuild_from_sentence_ssml(remaining_sentences)
[docs]
def __iter__(self) -> "Iterator[str | Document]":
    """Iterate through sentences.

    Equivalent to calling ``sentences(as_documents=False)``.

    Yields:
        SSML sentence strings

    Example:
        >>> doc = ssmd.Document("First. Second.")
        >>> for sentence in doc:
        ...     print(sentence)
    """
    sentence_stream = self.sentences(as_documents=False)
    return sentence_stream
[docs]
def __iadd__(self, other: "str | Document") -> "Document":
    """Support += operator for appending content.

    The content is appended through ``add()``, i.e. without a separator.
    For a Document operand its raw SSMD text is appended.

    Args:
        other: String or Document to append

    Returns:
        Self for chaining

    Example:
        >>> doc = ssmd.Document("Hello")
        >>> doc += " world"
        >>> other = ssmd.Document("More")
        >>> doc += other
    """
    appended_text = other.ssmd if isinstance(other, Document) else other
    return self.add(appended_text)
# ═══════════════════════════════════════════════════════════
# EDITING METHODS
# ═══════════════════════════════════════════════════════════
[docs]
def insert(self, index: int, text: str, separator: str = "") -> "Document":
    """Insert text at specific fragment index.

    When the text is appended at the end, the separator is placed before
    it; otherwise the separator is placed between the inserted text and
    the fragment that follows it. Empty text is ignored.

    Args:
        index: Position to insert (0 = beginning)
        text: SSMD text to insert
        separator: Separator to use ("", "\\n", or "\\n\\n")

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("Hello world")
        >>> doc.insert(0, "Start: ", "")
        >>> doc.ssmd
        'Start: Hello world'
    """
    if not text:
        return self

    self._invalidate_cache()
    fragments = self._fragments
    separators = self._separators

    if not fragments:
        # First fragment of the document: no separator needed
        fragments.append(text)
    elif index >= len(fragments):
        # Past the end: append, separator goes in front of the new text
        separators.append(separator)
        fragments.append(text)
    else:
        # Beginning or middle: both lists take the new entry at ``index``
        fragments.insert(index, text)
        separators.insert(index, separator)
    return self
[docs]
def remove(self, index: int) -> "Document":
    """Remove the sentence at index.

    This delegates to ``del doc[index]`` (which works on sentences, not on
    raw fragments) and returns self so calls can be chained.

    Args:
        index: Sentence index to remove

    Returns:
        Self for method chaining

    Raises:
        IndexError: If index is out of range

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> doc.remove(1)
    """
    document = self
    del document[index]
    return document
[docs]
def clear(self) -> "Document":
    """Remove all content from the document.

    Fragments and separators are emptied in place and cached output is
    dropped.

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("Hello world")
        >>> doc.clear()
        >>> doc.ssmd
        ''
    """
    for storage in (self._fragments, self._separators):
        storage.clear()
    self._invalidate_cache()
    return self
[docs]
def replace(self, old: str, new: str, count: int = -1) -> "Document":
    """Replace text across all fragments.

    Fragments are processed in order. With a non-negative ``count`` at
    most that many occurrences are replaced in total across the whole
    document.

    Args:
        old: Text to find
        new: Text to replace with
        count: Maximum replacements. Any negative value (default -1)
            replaces all occurrences, matching ``str.replace``.

    Returns:
        Self for method chaining

    Example:
        >>> doc = ssmd.Document("Hello world. Hello again.")
        >>> doc.replace("Hello", "Hi")
        >>> doc.ssmd
        'Hi world. Hi again.'
    """
    self._invalidate_cache()

    if count < 0:
        for position, fragment in enumerate(self._fragments):
            self._fragments[position] = fragment.replace(old, new)
        return self

    remaining = count
    for position, fragment in enumerate(self._fragments):
        if remaining <= 0:
            break
        # Count occurrences of ``old`` in the untouched fragment. Deriving
        # the number from occurrences of ``new`` before/after is wrong when
        # ``new`` is empty or overlaps with ``old``.
        occurrences = fragment.count(old)
        if not occurrences:
            continue
        self._fragments[position] = fragment.replace(old, new, remaining)
        remaining -= min(occurrences, remaining)
    return self
# ═══════════════════════════════════════════════════════════
# ADVANCED METHODS
# ═══════════════════════════════════════════════════════════
[docs]
def merge(self, other: "Document", separator: str = "\n\n") -> "Document":
    """Merge another document into this one.

    The other document's fragments and separators are appended to this
    one, joined by ``separator``. Merging an empty document is a no-op.
    Merging a document into itself duplicates its content.

    Args:
        other: Document to merge
        separator: Separator to use between documents

    Returns:
        Self for method chaining

    Example:
        >>> doc1 = ssmd.Document("First document.")
        >>> doc2 = ssmd.Document("Second document.")
        >>> doc1.merge(doc2)
        >>> doc1.ssmd
        'First document.\\n\\nSecond document.'
    """
    if not other._fragments:
        return self

    # Snapshot before mutating: when ``other`` is ``self`` the lists being
    # extended are the ones being read, and the joining separator would be
    # copied a second time, leaving one separator too many.
    incoming_fragments = list(other._fragments)
    incoming_separators = list(other._separators)

    self._invalidate_cache()
    if self._fragments:
        self._separators.append(separator)
        self._fragments.extend(incoming_fragments)
        self._separators.extend(incoming_separators)
    else:
        self._fragments = incoming_fragments
        self._separators = incoming_separators
    return self
[docs]
def split(self) -> list["Document"]:
    """Split document into individual sentence Documents.

    Every SSML sentence produced by ``sentences()`` is converted into its
    own Document via ``Document.from_ssml`` with this document's config
    and capabilities.

    Returns:
        List of Document objects, one per sentence

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> sentences = doc.split()
        >>> len(sentences)
        3
        >>> sentences[0].ssmd
        'First.'
    """
    sentence_documents: list["Document"] = []
    for sentence_ssml in self.sentences(as_documents=False):
        sentence_documents.append(
            Document.from_ssml(
                str(sentence_ssml),  # Ensure it's a string
                config=self._config,
                capabilities=self._capabilities,
            )
        )
    return sentence_documents
[docs]
def get_fragment(self, index: int) -> str:
    """Get raw fragment by index (not sentence).

    Reads the internal fragment list directly; fragment boundaries follow
    how the document was built and may differ from sentence boundaries.

    Args:
        index: Fragment index

    Returns:
        Raw SSMD fragment string

    Raises:
        IndexError: If index is out of range

    Example:
        >>> doc = ssmd.Document()
        >>> doc.add("First")
        >>> doc.add_sentence("Second")
        >>> doc.get_fragment(0)
        'First'
        >>> doc.get_fragment(1)
        'Second'
    """
    stored_fragments = self._fragments
    return stored_fragments[index]
# ═══════════════════════════════════════════════════════════
# INTERNAL HELPERS
# ═══════════════════════════════════════════════════════════
def _rebuild_from_sentence_ssml(
    self,
    sentences: list[str],
    *,
    replacement_index: int | None = None,
    replacement_ssmd: str | None = None,
) -> None:
    """Rebuild fragments from SSML sentence list.

    Every SSML sentence is converted back to SSMD and stored as one
    fragment; fragments are joined by single newlines. Cached output is
    dropped afterwards.

    Args:
        sentences: List of SSML sentence strings
        replacement_index: Optional index to replace with SSMD content
        replacement_ssmd: SSMD content to use at replacement_index; when
            None the sentence at that index is converted like the others
    """
    from ssmd.ssml_parser import SSMLParser

    parser = SSMLParser(self._config)

    rebuilt: list[str] = []
    for position, sentence_ssml in enumerate(sentences):
        use_replacement = (
            replacement_ssmd is not None
            and replacement_index is not None
            and position == replacement_index
        )
        if use_replacement:
            rebuilt.append(replacement_ssmd)
        else:
            rebuilt.append(
                parser.to_ssmd(sentence_ssml, capabilities=self._capabilities)
            )

    self._fragments = rebuilt
    # One newline separator between each pair of neighbouring fragments
    self._separators = ["\n"] * max(len(rebuilt) - 1, 0)
    self._invalidate_cache()
def _sentence_detection_config(
self,
) -> tuple[str | None, str | None, bool | None, str | None]:
model_size = self._config.get("sentence_model_size")
spacy_model = self._config.get("sentence_spacy_model")
use_spacy = self._config.get("sentence_use_spacy")
model_size_value = model_size or (
spacy_model.split("_")[-1] if spacy_model else None
)
return model_size, spacy_model, use_spacy, model_size_value
def _parse_sentence_objects(self) -> list["Sentence"]:
    """Parse the current SSMD content into Sentence objects.

    Passes the resolved capabilities, the sentence-detection settings and
    the heading/extension configuration on to ``parse_sentences``.

    Returns:
        Sentence objects as returned by ``parse_sentences``
    """
    detection = self._sentence_detection_config()
    model_size, spacy_model, use_spacy = detection[0], detection[1], detection[2]
    source = self.ssmd
    capabilities = self._get_capabilities()
    heading_levels = self._config.get("heading_levels")
    extensions = self._config.get("extensions")
    return parse_sentences(
        source,
        capabilities=capabilities,
        model_size=model_size,
        spacy_model=spacy_model,
        use_spacy=use_spacy,
        heading_levels=heading_levels,
        extensions=extensions,
        parse_yaml_header=self._parse_yaml_header,
        strict_parse=self._strict,
    )
def _parse_paragraph_objects(self) -> list[Paragraph]:
    """Parse the current SSMD content into Paragraph objects.

    Uses the effective model size from ``_sentence_detection_config`` and
    passes the resolved capabilities and heading/extension configuration
    on to ``parse_paragraphs``.

    Returns:
        Paragraph objects as returned by ``parse_paragraphs``
    """
    detection = self._sentence_detection_config()
    use_spacy, effective_model_size = detection[2], detection[3]
    source = self.ssmd
    capabilities = self._get_capabilities()
    heading_levels = self._config.get("heading_levels")
    extensions = self._config.get("extensions")
    return parse_paragraphs(
        source,
        capabilities=capabilities,
        heading_levels=heading_levels,
        extensions=extensions,
        use_spacy=use_spacy,
        model_size=effective_model_size,
        parse_yaml_header=self._parse_yaml_header,
        strict_parse=self._strict,
    )
def _populate_sentence_cache(self) -> None:
    """Fill the sentence cache with per-sentence SSML strings.

    Does nothing when the cache is already populated. Every sentence is
    rendered with ``wrap_sentence=True``; when syntax escaping is enabled
    the escape placeholders are restored in each rendered string.
    """
    if self._cached_sentences is not None:
        return

    capabilities = self._get_capabilities()
    extensions = self._config.get("extensions")
    rendered: list[str] = [
        sentence.to_ssml(
            capabilities=capabilities,
            extensions=extensions,
            wrap_sentence=True,
            warnings=self.warnings if self._strict else None,
        )
        for sentence in self._parse_sentence_objects()
    ]

    if self._escape_syntax:
        from ssmd.utils import unescape_ssmd_syntax

        rendered = [
            unescape_ssmd_syntax(item, xml_safe=True) for item in rendered
        ]
    self._cached_sentences = rendered
def _populate_paragraph_cache(self) -> None:
    """Parse and cache the Paragraph objects if not cached already."""
    if self._cached_paragraphs is not None:
        return
    self._cached_paragraphs = self._parse_paragraph_objects()
def _get_capabilities(self) -> "TTSCapabilities | None":
    """Get resolved TTSCapabilities object.

    A preset name is resolved with ``get_preset``; a TTSCapabilities
    instance is used as-is. The result is stored so the lookup happens
    only once per capabilities value.

    Returns:
        TTSCapabilities instance or None
    """
    requested = self._capabilities
    if self._capabilities_obj is None and requested is not None:
        from ssmd.capabilities import TTSCapabilities, get_preset

        if isinstance(requested, str):
            self._capabilities_obj = get_preset(requested)
        elif isinstance(requested, TTSCapabilities):
            self._capabilities_obj = requested
    return self._capabilities_obj
def _collect_namespaces(
    self,
    sentences: list["Sentence"],
    extensions: dict | None,
    capabilities: "TTSCapabilities | None",
) -> dict[str, str]:
    """Collect the XML namespaces needed for the given sentences.

    Starts from a copy of the configured "namespaces" entry and adds the
    ``namespaces`` attribute of every extension handler whose extension is
    used by a segment, skipping extensions the capabilities do not support.

    Args:
        sentences: Parsed sentences whose segments are inspected
        extensions: Custom extension handlers; these take precedence over
            the default handlers of the same name
        capabilities: Resolved capabilities, or None for no restriction

    Returns:
        Mapping of namespace prefix to URI
    """
    collected: dict[str, str] = dict(self._config.get("namespaces") or {})

    from ssmd.segment import DEFAULT_EXTENSIONS

    handlers = {**DEFAULT_EXTENSIONS, **(extensions or {})}

    # Names of all extensions referenced by at least one segment
    used_extensions = set()
    for sentence in sentences:
        for segment in sentence.segments:
            if segment.extension:
                used_extensions.add(segment.extension)

    for extension_name in used_extensions:
        supported = not capabilities or capabilities.supports_extension(
            extension_name
        )
        if not supported:
            continue
        handler = handlers.get(extension_name)
        handler_namespaces = getattr(handler, "namespaces", None)
        if handler_namespaces:
            collected.update(handler_namespaces)
    return collected
def _format_namespace_attrs(self, namespaces: dict[str, str]) -> str:
    """Format namespaces as ``xmlns:prefix="uri"`` attributes.

    Attributes are emitted sorted by prefix. URI values are XML-escaped
    (``&``, ``<``, ``>`` and ``"``) so that a URI containing such
    characters cannot break the surrounding tag.

    Args:
        namespaces: Mapping of namespace prefix to URI

    Returns:
        Attribute string with a leading space, or "" when there are no
        namespaces
    """
    if not namespaces:
        return ""

    from xml.sax.saxutils import escape

    attrs = " ".join(
        'xmlns:{}="{}"'.format(prefix, escape(str(uri), {'"': "&quot;"}))
        for prefix, uri in sorted(namespaces.items())
    )
    return f" {attrs}"
def _invalidate_cache(self) -> None:
    """Drop the cached SSML, sentence list and paragraph list."""
    self._cached_ssml = self._cached_sentences = self._cached_paragraphs = None
[docs]
def __repr__(self) -> str:
    """String representation of document.

    Shows the sentence count and the length of the raw SSMD text. If
    counting the sentences fails, only the character count is shown.

    Returns:
        Representation string

    Example:
        >>> doc = ssmd.Document("Hello.\\nWorld.")
        >>> repr(doc)
        'Document(2 sentences, 13 chars)'
    """
    try:
        sentence_count = len(self)
    except Exception:
        # Best effort: repr() must stay usable even if parsing fails
        return f"Document({len(self.ssmd)} chars)"
    return f"Document({sentence_count} sentences, {len(self.ssmd)} chars)"
[docs]
def __str__(self) -> str:
    """Return the raw SSMD content of the document.

    Returns:
        SSMD string

    Example:
        >>> doc = ssmd.Document("Hello *world*")
        >>> str(doc)
        'Hello *world*'
    """
    raw_content = self.ssmd
    return raw_content