# Spaces file view — status: Build error; file size: 8,220 bytes; revision 6aea21a.
"""Concrete implementation of audio processing service."""
import logging
import time
from typing import TYPE_CHECKING

from ..interfaces.audio_processing import IAudioProcessingService
from ..interfaces.speech_recognition import ISpeechRecognitionService
from ..interfaces.translation import ITranslationService
from ..interfaces.speech_synthesis import ISpeechSynthesisService
from ..models.processing_result import ProcessingResult
from ..models.translation_request import TranslationRequest
from ..models.speech_synthesis_request import SpeechSynthesisRequest
from ..exceptions import (
    AudioProcessingException,
    SpeechRecognitionException,
    TranslationFailedException,
    SpeechSynthesisException
)

if TYPE_CHECKING:
    from ..models.audio_content import AudioContent
    from ..models.voice_settings import VoiceSettings
    # NOTE(review): 'TextContent' is referenced by annotations in this module
    # but was never imported; the module path below is inferred from the
    # sibling model modules — confirm. Type-checking only, no runtime effect.
    from ..models.text_content import TextContent
class AudioProcessingService(IAudioProcessingService):
    """Concrete implementation of audio processing pipeline orchestration.

    Chains three injected services -- speech recognition (STT), translation
    and speech synthesis (TTS) -- and reports the outcome of the whole run as
    a ``ProcessingResult``.
    """

    # Longest input accepted by the pipeline, in seconds (5 minutes).
    MAX_AUDIO_DURATION_SECONDS = 300

    # Model name handed to the speech recognition service. Kept as a class
    # attribute so a subclass can override it without touching the pipeline.
    DEFAULT_STT_MODEL = "whisper-base"

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        speech_recognition_service: ISpeechRecognitionService,
        translation_service: ITranslationService,
        speech_synthesis_service: ISpeechSynthesisService
    ):
        """
        Initialize the audio processing service with injected dependencies.

        Args:
            speech_recognition_service: Service for speech-to-text conversion
            translation_service: Service for text translation
            speech_synthesis_service: Service for text-to-speech synthesis
        """
        self._speech_recognition_service = speech_recognition_service
        self._translation_service = translation_service
        self._speech_synthesis_service = speech_synthesis_service

    def process_audio_pipeline(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> 'ProcessingResult':
        """
        Process audio through the complete pipeline: STT -> Translation -> TTS.

        This method does not propagate ``Exception`` subclasses: any such
        error raised by validation or by a pipeline step is converted into a
        failure result.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Returns:
            ProcessingResult: A success result holding the original text, the
                translated text, the synthesized audio and the elapsed time;
                or a failure result holding an error message and the elapsed
                time.
        """
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the system clock is adjusted.
        start_time = time.perf_counter()

        try:
            # Validate inputs
            self._validate_pipeline_inputs(audio, target_language, voice_settings)

            # Step 1: Speech Recognition (STT)
            original_text = self._perform_speech_recognition(audio)

            # Step 2: Translation
            translated_text = self._perform_translation(original_text, target_language)

            # Step 3: Speech Synthesis (TTS)
            audio_output = self._perform_speech_synthesis(translated_text, voice_settings)

            return ProcessingResult.success_result(
                original_text=original_text,
                translated_text=translated_text,
                audio_output=audio_output,
                processing_time=time.perf_counter() - start_time
            )
        except (
            AudioProcessingException,
            SpeechRecognitionException,
            TranslationFailedException,
            SpeechSynthesisException
        ) as e:
            # Domain errors (including input validation failures) already
            # carry a caller-ready message, so it is reported unchanged.
            return ProcessingResult.failure_result(
                error_message=str(e),
                processing_time=time.perf_counter() - start_time
            )
        except Exception as e:
            # Only the message survives in the result, so record the full
            # traceback here before it is lost.
            self._logger.exception("Unexpected error in audio processing pipeline")
            error_message = f"Unexpected error in audio processing pipeline: {str(e)}"
            return ProcessingResult.failure_result(
                error_message=error_message,
                processing_time=time.perf_counter() - start_time
            )

    def _validate_pipeline_inputs(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> None:
        """
        Validate inputs for the audio processing pipeline.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Raises:
            AudioProcessingException: If validation fails
        """
        if audio is None:
            raise AudioProcessingException("Audio content cannot be None")

        if not target_language or not target_language.strip():
            raise AudioProcessingException("Target language cannot be empty")

        if voice_settings is None:
            raise AudioProcessingException("Voice settings cannot be None")

        # Validate that voice settings language matches target language
        if voice_settings.language != target_language:
            raise AudioProcessingException(
                f"Voice settings language ({voice_settings.language}) must match "
                f"target language ({target_language})"
            )

        # Validate audio duration for processing limits
        if audio.duration > self.MAX_AUDIO_DURATION_SECONDS:
            raise AudioProcessingException(
                f"Audio duration ({audio.duration:.1f}s) exceeds maximum allowed "
                f"duration ({self.MAX_AUDIO_DURATION_SECONDS}s)"
            )

        # Validate audio format is supported
        if not audio.is_valid_format:
            raise AudioProcessingException(f"Unsupported audio format: {audio.format}")

    def _perform_speech_recognition(self, audio: 'AudioContent') -> 'TextContent':
        """
        Perform speech recognition on the input audio.

        Args:
            audio: The input audio content

        Returns:
            TextContent: The transcribed text

        Raises:
            SpeechRecognitionException: If transcription fails; the original
                error is attached as ``__cause__``
        """
        try:
            return self._speech_recognition_service.transcribe(audio, self.DEFAULT_STT_MODEL)
        except Exception as e:
            raise SpeechRecognitionException(f"Speech recognition failed: {str(e)}") from e

    def _perform_translation(self, text: 'TextContent', target_language: str) -> 'TextContent':
        """
        Perform translation of the transcribed text.

        Args:
            text: The text to translate
            target_language: The target language for translation

        Returns:
            TextContent: The translated text, or ``text`` itself when it is
                already in the target language

        Raises:
            TranslationFailedException: If translation fails; the original
                error is attached as ``__cause__``
        """
        try:
            # No translation needed when the text is already in the target language
            if text.language == target_language:
                return text

            translation_request = TranslationRequest(
                source_text=text,
                target_language=target_language
            )
            return self._translation_service.translate(translation_request)
        except Exception as e:
            raise TranslationFailedException(f"Translation failed: {str(e)}") from e

    def _perform_speech_synthesis(
        self,
        text: 'TextContent',
        voice_settings: 'VoiceSettings'
    ) -> 'AudioContent':
        """
        Perform speech synthesis on the translated text.

        Args:
            text: The text to synthesize
            voice_settings: Voice settings for synthesis

        Returns:
            AudioContent: The synthesized audio

        Raises:
            SpeechSynthesisException: If synthesis fails; the original error
                is attached as ``__cause__``
        """
        try:
            synthesis_request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )
            return self._speech_synthesis_service.synthesize(synthesis_request)
        except Exception as e:
            raise SpeechSynthesisException(f"Speech synthesis failed: {str(e)}") from e