File size: 8,220 Bytes
6aea21a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""Concrete implementation of audio processing service."""

import time
from typing import TYPE_CHECKING

from ..interfaces.audio_processing import IAudioProcessingService
from ..interfaces.speech_recognition import ISpeechRecognitionService
from ..interfaces.translation import ITranslationService
from ..interfaces.speech_synthesis import ISpeechSynthesisService
from ..models.processing_result import ProcessingResult
from ..models.translation_request import TranslationRequest
from ..models.speech_synthesis_request import SpeechSynthesisRequest
from ..exceptions import (
    AudioProcessingException,
    SpeechRecognitionException,
    TranslationFailedException,
    SpeechSynthesisException
)

if TYPE_CHECKING:
    from ..models.audio_content import AudioContent
    from ..models.voice_settings import VoiceSettings


class AudioProcessingService(IAudioProcessingService):
    """Concrete implementation of audio processing pipeline orchestration.

    Chains three injected services in order: speech recognition (STT),
    translation, and speech synthesis (TTS). Failures in any step are
    converted into a failure ``ProcessingResult`` rather than propagated.
    """

    # NOTE(review): 'TextContent' is referenced in annotations below but is not
    # imported under TYPE_CHECKING in this file -- confirm its module path and
    # add the import so static type checkers can resolve it.

    # Defaults kept identical to the values that used to be hard-coded.
    DEFAULT_STT_MODEL = "whisper-base"
    MAX_AUDIO_DURATION_SECONDS = 300  # 5 minutes limit

    def __init__(
        self,
        speech_recognition_service: ISpeechRecognitionService,
        translation_service: ITranslationService,
        speech_synthesis_service: ISpeechSynthesisService,
        *,
        stt_model: str = DEFAULT_STT_MODEL,
        max_audio_duration: float = MAX_AUDIO_DURATION_SECONDS,
    ):
        """
        Initialize the audio processing service with injected dependencies.

        Args:
            speech_recognition_service: Service for speech-to-text conversion
            translation_service: Service for text translation
            speech_synthesis_service: Service for text-to-speech synthesis
            stt_model: Model name passed to the speech recognition service
                (keyword-only; defaults to ``DEFAULT_STT_MODEL``)
            max_audio_duration: Maximum accepted ``audio.duration`` value
                (keyword-only; defaults to ``MAX_AUDIO_DURATION_SECONDS``)
        """
        self._speech_recognition_service = speech_recognition_service
        self._translation_service = translation_service
        self._speech_synthesis_service = speech_synthesis_service
        self._stt_model = stt_model
        self._max_audio_duration = max_audio_duration

    def process_audio_pipeline(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> 'ProcessingResult':
        """
        Process audio through the complete pipeline: STT -> Translation -> TTS.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Returns:
            ProcessingResult: A success result carrying the original text,
            translated text, synthesized audio and processing time; or a
            failure result carrying an error message and processing time.

        Note:
            Exceptions derived from ``Exception`` are not propagated to the
            caller; they are reported through the failure result instead.
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        try:
            # Validate inputs
            self._validate_pipeline_inputs(audio, target_language, voice_settings)

            # Step 1: Speech Recognition (STT)
            original_text = self._perform_speech_recognition(audio)

            # Step 2: Translation
            translated_text = self._perform_translation(original_text, target_language)

            # Step 3: Speech Synthesis (TTS)
            audio_output = self._perform_speech_synthesis(translated_text, voice_settings)

            return ProcessingResult.success_result(
                original_text=original_text,
                translated_text=translated_text,
                audio_output=audio_output,
                processing_time=time.perf_counter() - start_time
            )

        except (
            AudioProcessingException,
            SpeechRecognitionException,
            TranslationFailedException,
            SpeechSynthesisException,
        ) as e:
            # Domain-specific failures (including input validation errors
            # raised by _validate_pipeline_inputs) already carry a meaningful
            # message, so it is reported verbatim.
            return ProcessingResult.failure_result(
                error_message=str(e),
                processing_time=time.perf_counter() - start_time
            )
        except Exception as e:
            # Anything else is genuinely unexpected; label it as such.
            return ProcessingResult.failure_result(
                error_message=f"Unexpected error in audio processing pipeline: {e}",
                processing_time=time.perf_counter() - start_time
            )

    def _validate_pipeline_inputs(
        self,
        audio: 'AudioContent',
        target_language: str,
        voice_settings: 'VoiceSettings'
    ) -> None:
        """
        Validate inputs for the audio processing pipeline.

        Checks are applied in order: audio presence, non-blank target
        language, voice settings presence, voice/target language agreement,
        audio duration limit, and audio format validity.

        Args:
            audio: The input audio content
            target_language: The target language for translation
            voice_settings: Voice settings for TTS synthesis

        Raises:
            AudioProcessingException: If validation fails
        """
        if audio is None:
            raise AudioProcessingException("Audio content cannot be None")

        if not target_language or not target_language.strip():
            raise AudioProcessingException("Target language cannot be empty")

        if voice_settings is None:
            raise AudioProcessingException("Voice settings cannot be None")

        # Validate that voice settings language matches target language
        if voice_settings.language != target_language:
            raise AudioProcessingException(
                f"Voice settings language ({voice_settings.language}) must match "
                f"target language ({target_language})"
            )

        # Validate audio duration for processing limits
        max_duration = self._max_audio_duration
        if audio.duration > max_duration:
            raise AudioProcessingException(
                f"Audio duration ({audio.duration:.1f}s) exceeds maximum allowed "
                f"duration ({max_duration}s)"
            )

        # Validate audio format is supported
        if not audio.is_valid_format:
            raise AudioProcessingException(f"Unsupported audio format: {audio.format}")

    def _perform_speech_recognition(self, audio: 'AudioContent') -> 'TextContent':
        """
        Perform speech recognition on the input audio.

        Args:
            audio: The input audio content

        Returns:
            TextContent: The transcribed text

        Raises:
            SpeechRecognitionException: If transcription fails; the underlying
                error is chained as ``__cause__``
        """
        try:
            return self._speech_recognition_service.transcribe(audio, self._stt_model)
        except Exception as e:
            raise SpeechRecognitionException(f"Speech recognition failed: {e}") from e

    def _perform_translation(self, text: 'TextContent', target_language: str) -> 'TextContent':
        """
        Perform translation of the transcribed text.

        Args:
            text: The text to translate
            target_language: The target language for translation

        Returns:
            TextContent: The translated text, or ``text`` itself when its
            language already equals ``target_language``

        Raises:
            TranslationFailedException: If translation fails; the underlying
                error is chained as ``__cause__``
        """
        try:
            # Skip the translation service entirely when no translation is needed
            if text.language == target_language:
                return text

            translation_request = TranslationRequest(
                source_text=text,
                target_language=target_language
            )

            return self._translation_service.translate(translation_request)
        except Exception as e:
            raise TranslationFailedException(f"Translation failed: {e}") from e

    def _perform_speech_synthesis(
        self,
        text: 'TextContent',
        voice_settings: 'VoiceSettings'
    ) -> 'AudioContent':
        """
        Perform speech synthesis on the translated text.

        Args:
            text: The text to synthesize
            voice_settings: Voice settings for synthesis

        Returns:
            AudioContent: The synthesized audio

        Raises:
            SpeechSynthesisException: If synthesis fails; the underlying
                error is chained as ``__cause__``
        """
        try:
            synthesis_request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )

            return self._speech_synthesis_service.synthesize(synthesis_request)
        except Exception as e:
            raise SpeechSynthesisException(f"Speech synthesis failed: {e}") from e