# Spaces: Build error
"""Concrete implementation of audio processing service.""" | |
import time | |
from typing import TYPE_CHECKING | |
from ..interfaces.audio_processing import IAudioProcessingService | |
from ..interfaces.speech_recognition import ISpeechRecognitionService | |
from ..interfaces.translation import ITranslationService | |
from ..interfaces.speech_synthesis import ISpeechSynthesisService | |
from ..models.processing_result import ProcessingResult | |
from ..models.translation_request import TranslationRequest | |
from ..models.speech_synthesis_request import SpeechSynthesisRequest | |
from ..exceptions import ( | |
AudioProcessingException, | |
SpeechRecognitionException, | |
TranslationFailedException, | |
SpeechSynthesisException | |
) | |
if TYPE_CHECKING: | |
from ..models.audio_content import AudioContent | |
from ..models.voice_settings import VoiceSettings | |
class AudioProcessingService(IAudioProcessingService):
    """Concrete implementation of audio processing pipeline orchestration.

    The service sequences three injected collaborators (speech recognition,
    translation, speech synthesis), validates the pipeline inputs up front and
    converts any failure into a ``ProcessingResult`` failure object.
    """

    # Model name handed to the speech recognition service. Kept as a class
    # attribute so a subclass can override it without touching the pipeline.
    DEFAULT_STT_MODEL = "whisper-base"

    # Longest input accepted by the pipeline, in seconds (5 minutes).
    MAX_AUDIO_DURATION_SECONDS = 300

    def __init__(
        self,
        speech_recognition_service: ISpeechRecognitionService,
        translation_service: ITranslationService,
        speech_synthesis_service: ISpeechSynthesisService
    ):
        """
        Initialize the audio processing service with injected dependencies.

        Args:
            speech_recognition_service: Service for speech-to-text conversion
            translation_service: Service for text translation
            speech_synthesis_service: Service for text-to-speech synthesis
        """
        self._speech_recognition_service = speech_recognition_service
        self._translation_service = translation_service
        self._speech_synthesis_service = speech_synthesis_service

    def process_audio_pipeline(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> 'ProcessingResult':
        """
        Process audio through the complete pipeline: STT -> Translation -> TTS.

        Every ``Exception`` raised while validating or running a stage is
        caught here and reported through a failure result instead of being
        propagated to the caller.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Returns:
            ProcessingResult: A success result carrying the original text, the
            translated text, the synthesized audio and the elapsed time, or a
            failure result carrying an error message and the elapsed time.
        """
        # perf_counter() is monotonic, so the measured duration cannot jump or
        # go negative when the system wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        try:
            # Validate inputs before any (potentially expensive) stage runs.
            self._validate_pipeline_inputs(audio, target_language, voice_settings)

            # Step 1: Speech Recognition (STT)
            original_text = self._perform_speech_recognition(audio)

            # Step 2: Translation
            translated_text = self._perform_translation(original_text, target_language)

            # Step 3: Speech Synthesis (TTS)
            audio_output = self._perform_speech_synthesis(translated_text, voice_settings)

            # Built inside the try block on purpose: a failure while creating
            # the result is reported as a failure result as well.
            return ProcessingResult.success_result(
                original_text=original_text,
                translated_text=translated_text,
                audio_output=audio_output,
                processing_time=time.perf_counter() - start_time
            )
        except (
            AudioProcessingException,
            SpeechRecognitionException,
            TranslationFailedException,
            SpeechSynthesisException
        ) as e:
            # Domain errors, including validation failures raised by
            # _validate_pipeline_inputs, already carry a user-facing message.
            error_message = str(e)
        except Exception as e:
            # Anything else is genuinely unexpected; say so in the message.
            error_message = f"Unexpected error in audio processing pipeline: {e}"

        return ProcessingResult.failure_result(
            error_message=error_message,
            processing_time=time.perf_counter() - start_time
        )

    def _validate_pipeline_inputs(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> None:
        """
        Validate inputs for the audio processing pipeline.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Raises:
            AudioProcessingException: If any input is missing, the voice
                language differs from the target language, the audio is
                longer than ``MAX_AUDIO_DURATION_SECONDS``, or the audio
                format is reported as invalid.
        """
        if audio is None:
            raise AudioProcessingException("Audio content cannot be None")

        if not target_language or not target_language.strip():
            raise AudioProcessingException("Target language cannot be empty")

        if voice_settings is None:
            raise AudioProcessingException("Voice settings cannot be None")

        # The synthesized voice must speak the language we translate into.
        if voice_settings.language != target_language:
            raise AudioProcessingException(
                f"Voice settings language ({voice_settings.language}) must match "
                f"target language ({target_language})"
            )

        # Enforce the processing limit on input length.
        max_duration = self.MAX_AUDIO_DURATION_SECONDS
        if audio.duration > max_duration:
            raise AudioProcessingException(
                f"Audio duration ({audio.duration:.1f}s) exceeds maximum allowed "
                f"duration ({max_duration}s)"
            )

        # Reject formats the audio object itself reports as invalid.
        if not audio.is_valid_format:
            raise AudioProcessingException(f"Unsupported audio format: {audio.format}")

    # NOTE(review): 'TextContent' is used below only as a string annotation and
    # is not among the TYPE_CHECKING imports visible in this file - confirm the
    # import exists or add it so type checkers can resolve the name.
    def _perform_speech_recognition(self, audio: 'AudioContent') -> 'TextContent':
        """
        Perform speech recognition on the input audio.

        Args:
            audio: The input audio content

        Returns:
            TextContent: The transcribed text

        Raises:
            SpeechRecognitionException: If transcription fails
        """
        try:
            return self._speech_recognition_service.transcribe(
                audio, self.DEFAULT_STT_MODEL
            )
        except Exception as e:
            # Chain the cause so the original traceback stays attached.
            raise SpeechRecognitionException(f"Speech recognition failed: {e}") from e

    def _perform_translation(self, text: 'TextContent', target_language: str) -> 'TextContent':
        """
        Perform translation of the transcribed text.

        Args:
            text: The text to translate
            target_language: The target language for translation

        Returns:
            TextContent: The translated text, or ``text`` itself when it is
            already in the target language

        Raises:
            TranslationFailedException: If translation fails
        """
        try:
            # Skip the translation service when source and target match.
            if text.language == target_language:
                return text

            translation_request = TranslationRequest(
                source_text=text,
                target_language=target_language
            )
            return self._translation_service.translate(translation_request)
        except Exception as e:
            # Chain the cause so the original traceback stays attached.
            raise TranslationFailedException(f"Translation failed: {e}") from e

    def _perform_speech_synthesis(
        self,
        text: 'TextContent',
        voice_settings: 'VoiceSettings'
    ) -> 'AudioContent':
        """
        Perform speech synthesis on the translated text.

        Args:
            text: The text to synthesize
            voice_settings: Voice settings for synthesis

        Returns:
            AudioContent: The synthesized audio

        Raises:
            SpeechSynthesisException: If synthesis fails
        """
        try:
            synthesis_request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )
            return self._speech_synthesis_service.synthesize(synthesis_request)
        except Exception as e:
            # Chain the cause so the original traceback stays attached.
            raise SpeechSynthesisException(f"Speech synthesis failed: {e}") from e