Spaces:

DroolingPanda
/

teachingAssistant

Build error

teachingAssistant / src /domain /interfaces /speech_recognition.py

Michael Hu

Add documentation and final validation

4e4961e 13 days ago

4.47 kB

	"""Speech recognition service interface.

	This module defines the interface for speech-to-text (STT) services that convert
	audio content into textual representation. The interface supports multiple STT
	models and providers with consistent error handling.

	The interface is designed to be:
	- Model-agnostic: Works with any STT implementation (Whisper, Parakeet, etc.)
	- Language-aware: Handles multiple languages and dialects
	- Error-resilient: Provides detailed error information for debugging
	- Performance-conscious: Supports both batch and streaming transcription
	"""

	from abc import ABC, abstractmethod
	from typing import TYPE_CHECKING

	if TYPE_CHECKING:
	from ..models.audio_content import AudioContent
	from ..models.text_content import TextContent


	class ISpeechRecognitionService(ABC):
	    """Contract for speech-to-text (STT) services.

	    Implementations convert audio content to text using various STT models
	    and providers, and should handle different audio formats, languages,
	    and quality levels.

	    The class is abstract: it cannot be instantiated directly, and every
	    concrete subclass must override :meth:`transcribe`.

	    Example:
	        ```python
	        # Use through dependency injection
	        stt_service = container.resolve(ISpeechRecognitionService)

	        # Transcribe audio
	        text_result = stt_service.transcribe(
	            audio=audio_content,
	            model="whisper-large"
	        )

	        print(f"Transcribed: {text_result.text}")
	        print(f"Language: {text_result.language}")
	        print(f"Confidence: {text_result.confidence}")
	        ```
	    """

	    @abstractmethod
	    def transcribe(self, audio: 'AudioContent', model: str) -> 'TextContent':
	        """Transcribe audio content to text using the specified STT model.

	        Converts audio data into a textual representation with language
	        detection and confidence scoring. Implementations should handle
	        various audio formats and quality levels gracefully.

	        The abstract body consists of this docstring only, so calling it
	        directly (for example through ``super()``) returns ``None``.

	        Implementation considerations:
	            - Audio preprocessing (noise reduction, normalization)
	            - Language detection and handling
	            - Confidence scoring and quality assessment
	            - Memory management for large audio files
	            - Timeout handling for long audio content

	        Args:
	            audio: The audio content to transcribe. Must contain valid audio
	                data in a supported format (WAV, MP3, FLAC, etc.) with
	                appropriate sample rate and duration.
	            model: The STT model identifier to use for transcription. Must be
	                supported by the implementation. Examples:
	                - "whisper-small": Fast, lower accuracy
	                - "whisper-large": Slower, higher accuracy
	                - "parakeet": Real-time optimized

	        Returns:
	            TextContent: The transcription result containing:
	                - text: The transcribed text content
	                - language: Detected or specified language code
	                - confidence: Overall transcription confidence (0.0-1.0)
	                - metadata: Additional information like word-level
	                  timestamps, alternative transcriptions, processing time

	        Raises:
	            SpeechRecognitionException: If transcription fails due to:
	                - Unsupported audio format or quality
	                - Model loading or inference errors
	                - Network issues (for cloud-based models)
	                - Insufficient system resources
	            ValueError: If input parameters are invalid:
	                - Empty or corrupted audio data
	                - Unsupported model identifier
	                - Invalid audio format specifications

	        Example:
	            ```python
	            # Load audio file
	            with open("speech.wav", "rb") as f:
	                audio = AudioContent(
	                    data=f.read(),
	                    format="wav",
	                    sample_rate=16000,
	                    duration=30.0
	                )

	            # Transcribe with high-accuracy model
	            try:
	                result = service.transcribe(audio, "whisper-large")

	                if result.confidence > 0.8:
	                    print(f"High confidence: {result.text}")
	                else:
	                    print(f"Low confidence: {result.text} ({result.confidence:.2f})")

	            except SpeechRecognitionException as e:
	                print(f"Transcription failed: {e}")
	            ```
	        """