Spaces:
Build error
Build error
File size: 7,325 Bytes
4e4961e 5009cb8 55e29e2 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 4e4961e 5009cb8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
"""Speech synthesis service interface.

This module defines the interface for text-to-speech (TTS) services that convert
textual content into audio. The interface supports both batch and streaming
synthesis with multiple voice options and quality settings.

The interface is designed to be:

- Voice-flexible: Supports multiple voices and languages
- Quality-configurable: Allows control over synthesis parameters
- Streaming-capable: Supports real-time audio generation
- Provider-agnostic: Works with any TTS implementation
"""
from abc import ABC, abstractmethod
from typing import Iterator, TYPE_CHECKING
if TYPE_CHECKING:
from ..models.speech_synthesis_request import SpeechSynthesisRequest
from ..models.audio_content import AudioContent
from ..models.audio_chunk import AudioChunk
class ISpeechSynthesisService(ABC):
    """Abstract contract for text-to-speech (TTS) backends.

    A concrete service turns the text carried by a synthesis request into
    audio, using whichever TTS model and voice the request selects. Two entry
    points are required so that callers can pick the mode that fits their use
    case: ``synthesize`` hands back the finished audio in one piece, while
    ``synthesize_stream`` produces it chunk by chunk.

    The request and result types are imported only under ``TYPE_CHECKING``,
    which is why every annotation below is written as a quoted forward
    reference.

    Example:
        ```python
        # Obtain an implementation through dependency injection
        tts_service = container.resolve(ISpeechSynthesisService)

        # Describe what should be spoken and how
        request = SpeechSynthesisRequest(
            text_content=text_content,
            voice_settings=voice_settings
        )

        # Batch mode: wait for the complete audio
        audio = tts_service.synthesize(request)

        # Streaming mode: handle audio as it is produced
        for chunk in tts_service.synthesize_stream(request):
            play_audio_chunk(chunk)
        ```
    """

    @abstractmethod
    def synthesize(self, request: 'SpeechSynthesisRequest') -> 'AudioContent':
        """Produce the complete audio for a request in a single call.

        The text in ``request`` is rendered with the requested voice settings
        and the whole result is returned at once. Prefer this mode for short
        texts, or whenever playback cannot begin until all audio exists.

        Points an implementation is expected to think about:
            - Text preprocessing (SSML support, pronunciation handling)
            - Loading and configuring the voice
            - Optimizing audio quality
            - Managing memory for long texts
            - Recovering from errors and falling back to other voices

        Args:
            request: The speech synthesis request, made up of:
                - text_content: The text to speak, with language information
                - voice_settings: Voice configuration such as voice ID, speed,
                  pitch, volume, and other voice-specific parameters

        Returns:
            AudioContent: The synthesized audio, made up of:
                - data: Raw audio data in the specified format
                - format: Audio format (WAV, MP3, etc.)
                - sample_rate: Audio sample rate in Hz
                - duration: Audio duration in seconds
                - metadata: Additional synthesis information

        Raises:
            SpeechSynthesisException: When synthesis fails because of:
                - An unsupported voice or language
                - Text processing errors (invalid characters, length limits)
                - A voice model that fails to load
                - Insufficient system resources
            ValueError: When the request parameters are invalid:
                - Empty text content
                - Unsupported voice settings
                - Invalid audio format specifications

        Example:
            ```python
            # Text to be spoken
            text = TextContent(
                text="Hello, this is a test of speech synthesis.",
                language="en"
            )

            # Voice configuration
            voice_settings = VoiceSettings(
                voice_id="kokoro",
                speed=1.0,
                pitch=0.0,
                volume=1.0
            )

            # Bundle both into a request
            request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )

            # Run the synthesis and store the result
            try:
                audio = service.synthesize(request)

                with open("output.wav", "wb") as f:
                    f.write(audio.data)

                print(f"Generated {audio.duration:.1f}s of audio")
            except SpeechSynthesisException as e:
                print(f"Synthesis failed: {e}")
            ```
        """
        ...

    @abstractmethod
    def synthesize_stream(self, request: 'SpeechSynthesisRequest') -> Iterator['AudioChunk']:
        """Produce the audio for a request incrementally, chunk by chunk.

        Audio chunks are yielded as soon as they become available instead of
        being collected into one result. Prefer this mode for real-time
        playback, for long texts, or whenever low latency matters.

        Points an implementation is expected to think about:
            - Choosing a chunk size that keeps playback smooth
            - Buffer management and memory efficiency
            - Handling errors without breaking the stream
            - Terminating the stream and cleaning up properly
            - Minimizing latency for real-time use cases

        Args:
            request: The speech synthesis request containing text and voice
                settings. Same format as for batch synthesis, but optimized
                for streaming.

        Yields:
            AudioChunk: One audio chunk at a time, made up of:
                - data: Raw audio data for this chunk
                - format: Audio format (consistent across chunks)
                - sample_rate: Audio sample rate in Hz
                - chunk_index: Sequential chunk number
                - is_final: Boolean indicating whether this is the last chunk
                - timestamp: Chunk generation timestamp

        Raises:
            SpeechSynthesisException: When synthesis fails mid-stream:
                - Voice model errors during processing
                - Network issues (for cloud-based synthesis)
                - Resource exhaustion during long synthesis
            ValueError: When the request parameters are invalid for streaming

        Example:
            ```python
            # Request for a long text
            request = SpeechSynthesisRequest(
                text_content=long_text,
                voice_settings=voice_settings
            )

            # Play back while synthesis is still running
            audio_buffer = []

            try:
                for chunk in service.synthesize_stream(request):
                    # Queue the new chunk for playback
                    audio_buffer.append(chunk.data)

                    # Begin playback once three chunks are buffered
                    if len(audio_buffer) >= 3:  # Buffer 3 chunks
                        play_audio_chunk(audio_buffer.pop(0))

                    # On the last chunk, drain whatever is still queued
                    if chunk.is_final:
                        for remaining in audio_buffer:
                            play_audio_chunk(remaining)
                        break

            except SpeechSynthesisException as e:
                print(f"Streaming synthesis failed: {e}")
            ```
        """
        ...