"""Speech synthesis service interface. This module defines the interface for text-to-speech (TTS) services that convert textual content into audio. The interface supports both batch and streaming synthesis with multiple voice options and quality settings. The interface is designed to be: - Voice-flexible: Supports multiple voices and languages - Quality-configurable: Allows control over synthesis parameters - Streaming-capable: Supports real-time audio generation - Provider-agnostic: Works with any TTS implementation """ from abc import ABC, abstractmethod from typing import Iterator, TYPE_CHECKING if TYPE_CHECKING: from ..models.speech_synthesis_request import SpeechSynthesisRequest from ..models.audio_content import AudioContent from ..models.audio_chunk import AudioChunk class ISpeechSynthesisService(ABC): """Interface for speech synthesis services. This interface defines the contract for converting text to speech using various TTS models and voices. Implementations should support both batch processing and streaming synthesis for different use cases. Example: ```python # Use through dependency injection tts_service = container.resolve(ISpeechSynthesisService) # Create synthesis request request = SpeechSynthesisRequest( text_content=text_content, voice_settings=voice_settings ) # Batch synthesis audio = tts_service.synthesize(request) # Or streaming synthesis for chunk in tts_service.synthesize_stream(request): # Process audio chunk in real-time play_audio_chunk(chunk) ``` """ @abstractmethod def synthesize(self, request: 'SpeechSynthesisRequest') -> 'AudioContent': """Synthesize speech from text in batch mode. Converts text content to audio using specified voice settings and returns the complete audio content. This method is suitable for shorter texts or when the complete audio is needed before playback. Implementation considerations: - Text preprocessing (SSML support, pronunciation handling) - Voice loading and configuration - Audio quality optimization - Memory management for long texts - Error recovery and fallback voices Args: request: The speech synthesis request containing: - text_content: Text to synthesize with language information - voice_settings: Voice configuration including voice ID, speed, pitch, volume, and other voice-specific parameters Returns: AudioContent: The synthesized audio containing: - data: Raw audio data in specified format - format: Audio format (WAV, MP3, etc.) - sample_rate: Audio sample rate in Hz - duration: Audio duration in seconds - metadata: Additional synthesis information Raises: SpeechSynthesisException: If synthesis fails due to: - Unsupported voice or language - Text processing errors (invalid characters, length limits) - Voice model loading failures - Insufficient system resources ValueError: If request parameters are invalid: - Empty text content - Unsupported voice settings - Invalid audio format specifications Example: ```python # Create text content text = TextContent( text="Hello, this is a test of speech synthesis.", language="en" ) # Configure voice settings voice_settings = VoiceSettings( voice_id="kokoro", speed=1.0, pitch=0.0, volume=1.0 ) # Create synthesis request request = SpeechSynthesisRequest( text_content=text, voice_settings=voice_settings ) # Synthesize audio try: audio = service.synthesize(request) # Save to file with open("output.wav", "wb") as f: f.write(audio.data) print(f"Generated {audio.duration:.1f}s of audio") except SpeechSynthesisException as e: print(f"Synthesis failed: {e}") ``` """ pass @abstractmethod def synthesize_stream(self, request: 'SpeechSynthesisRequest') -> Iterator['AudioChunk']: """Synthesize speech from text as a stream of audio chunks. Converts text content to audio in streaming mode, yielding audio chunks as they become available. This method is suitable for real-time playback, long texts, or when low latency is required. Implementation considerations: - Chunk size optimization for smooth playback - Buffer management and memory efficiency - Error handling without breaking the stream - Proper stream termination and cleanup - Latency minimization for real-time use cases Args: request: The speech synthesis request containing text and voice settings. Same format as batch synthesis but optimized for streaming. Yields: AudioChunk: Individual audio chunks containing: - data: Raw audio data for this chunk - format: Audio format (consistent across chunks) - sample_rate: Audio sample rate in Hz - chunk_index: Sequential chunk number - is_final: Boolean indicating if this is the last chunk - timestamp: Chunk generation timestamp Raises: SpeechSynthesisException: If synthesis fails during streaming: - Voice model errors during processing - Network issues (for cloud-based synthesis) - Resource exhaustion during long synthesis ValueError: If request parameters are invalid for streaming Example: ```python # Create streaming synthesis request request = SpeechSynthesisRequest( text_content=long_text, voice_settings=voice_settings ) # Stream synthesis with real-time playback audio_buffer = [] try: for chunk in service.synthesize_stream(request): # Add to playback buffer audio_buffer.append(chunk.data) # Start playback when buffer is sufficient if len(audio_buffer) >= 3: # Buffer 3 chunks play_audio_chunk(audio_buffer.pop(0)) # Handle final chunk if chunk.is_final: # Play remaining buffered chunks for remaining in audio_buffer: play_audio_chunk(remaining) break except SpeechSynthesisException as e: print(f"Streaming synthesis failed: {e}") ``` """ pass