""" Audio Processing Tool for GAIA Agent Provides comprehensive audio processing capabilities including: - Speech-to-text transcription using Whisper - Audio format support (MP3, WAV, M4A, etc.) - Content analysis and information extraction - Audio quality enhancement and noise reduction """ import os import logging import tempfile import asyncio from typing import Dict, Any, Optional, List, Union from pathlib import Path import json try: import soundfile as sf import numpy as np from faster_whisper import WhisperModel AUDIO_DEPS_AVAILABLE = True except ImportError as e: logging.warning(f"Audio dependencies not available: {e}") AUDIO_DEPS_AVAILABLE = False try: from .base_tool import SimpleAGNOTool except ImportError: from base_tool import SimpleAGNOTool logger = logging.getLogger(__name__) class AudioProcessingTool(SimpleAGNOTool): """ Advanced audio processing tool with Whisper integration for GAIA evaluation. Features: - Multi-format audio support (MP3, WAV, M4A, FLAC, OGG) - High-accuracy speech-to-text transcription - Content analysis and structured data extraction - Audio quality assessment and enhancement - Streaming support for large files """ def __init__(self): """Initialize the audio processing tool.""" super().__init__( name="audio_processing", description="Process audio files with speech-to-text transcription and content analysis" ) self.available = AUDIO_DEPS_AVAILABLE self.whisper_model = None self.supported_formats = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma'] self.max_file_size = 100 * 1024 * 1024 # 100MB self.transcription_timeout = 60 # seconds if self.available: self._init_whisper_model() else: logger.warning("⚠️ Audio processing tool not available - missing dependencies") def _init_whisper_model(self): """Initialize the Whisper model for transcription.""" try: # Use base model for balance of speed and accuracy # Can be upgraded to 'small' or 'medium' for better accuracy model_size = os.getenv('WHISPER_MODEL_SIZE', 'base') logger.info(f"🎤 Initializing Whisper model: {model_size}") self.whisper_model = WhisperModel( model_size, device="cpu", # Use CPU for compatibility compute_type="int8" # Optimize for memory usage ) logger.info("✅ Whisper model initialized successfully") except Exception as e: logger.error(f"❌ Failed to initialize Whisper model: {e}") self.available = False self.whisper_model = None def process_audio_file(self, file_path: str, extract_content: bool = True) -> Dict[str, Any]: """ Process an audio file with transcription and content analysis. 
        Args:
            file_path: Path to the audio file
            extract_content: Whether to perform content analysis

        Returns:
            Dictionary containing transcription and analysis results
        """
        if not self.available:
            return {
                'success': False,
                'error': 'Audio processing not available - missing dependencies',
                'transcription': '',
                'content_analysis': {}
            }

        try:
            # Validate file
            validation_result = self._validate_audio_file(file_path)
            if not validation_result['valid']:
                return {
                    'success': False,
                    'error': validation_result['error'],
                    'transcription': '',
                    'content_analysis': {}
                }

            # Transcribe audio
            logger.info(f"🎤 Transcribing audio file: {file_path}")
            transcription_result = self._transcribe_audio(file_path)

            if not transcription_result['success']:
                return transcription_result

            transcription = transcription_result['transcription']

            # Perform content analysis if requested
            content_analysis = {}
            if extract_content and transcription:
                content_analysis = self._analyze_content(transcription)

            result = {
                'success': True,
                'transcription': transcription,
                'content_analysis': content_analysis,
                'audio_info': validation_result.get('info', {}),
                'confidence': transcription_result.get('confidence', 0.0)
            }

            logger.info("✅ Audio processing completed successfully")
            logger.info(f"📝 Transcription length: {len(transcription)} characters")
            return result

        except Exception as e:
            logger.error(f"❌ Error processing audio file: {e}")
            return {
                'success': False,
                'error': f"Audio processing failed: {str(e)}",
                'transcription': '',
                'content_analysis': {}
            }

    def _validate_audio_file(self, file_path: str) -> Dict[str, Any]:
        """Validate audio file format, size, and accessibility."""
        try:
            path = Path(file_path)

            # Check if file exists
            if not path.exists():
                return {'valid': False, 'error': f"Audio file not found: {file_path}"}

            # Check file size
            file_size = path.stat().st_size
            if file_size > self.max_file_size:
                return {
                    'valid': False,
                    'error': f"File too large: {file_size / (1024*1024):.1f}MB "
                             f"(max: {self.max_file_size / (1024*1024)}MB)"
                }

            # Check file format
            file_ext = path.suffix.lower()
            if file_ext not in self.supported_formats:
                return {
                    'valid': False,
                    'error': f"Unsupported format: {file_ext}. "
                             f"Supported: {', '.join(self.supported_formats)}"
                }
            # Try to read audio info
            try:
                info = sf.info(file_path)
                audio_info = {
                    'duration': info.duration,
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'format': info.format,
                    'subtype': info.subtype
                }
            except Exception as e:
                return {'valid': False, 'error': f"Cannot read audio file: {str(e)}"}

            return {
                'valid': True,
                'info': audio_info
            }

        except Exception as e:
            return {'valid': False, 'error': f"File validation error: {str(e)}"}

    def _transcribe_audio(self, file_path: str) -> Dict[str, Any]:
        """Transcribe audio file using Whisper."""
        try:
            if not self.whisper_model:
                return {
                    'success': False,
                    'error': 'Whisper model not initialized',
                    'transcription': ''
                }

            # Transcribe the audio (language auto-detected, deterministic decoding)
            segments, info = self.whisper_model.transcribe(
                file_path,
                beam_size=5,
                language=None,  # Auto-detect language
                task="transcribe",
                temperature=0.0,  # Deterministic output
                compression_ratio_threshold=2.4,
                log_prob_threshold=-1.0,
                no_speech_threshold=0.6,
                condition_on_previous_text=False
            )

            # Combine segments into full transcription
            transcription_parts = []
            total_confidence = 0.0
            segment_count = 0

            for segment in segments:
                transcription_parts.append(segment.text.strip())
                if hasattr(segment, 'avg_logprob'):
                    total_confidence += segment.avg_logprob
                    segment_count += 1

            transcription = ' '.join(transcription_parts).strip()

            # Calculate average confidence
            avg_confidence = 0.0
            if segment_count > 0:
                avg_confidence = total_confidence / segment_count
                # Map the average log probability (typically in [-1, 0]) to a 0-1 confidence score
                avg_confidence = max(0.0, min(1.0, avg_confidence + 1.0))

            logger.info(f"🎤 Transcription completed: {len(transcription)} chars, confidence: {avg_confidence:.2f}")

            return {
                'success': True,
                'transcription': transcription,
                'confidence': avg_confidence,
                'language': info.language if hasattr(info, 'language') else 'unknown',
                'duration': info.duration if hasattr(info, 'duration') else 0.0
            }

        except Exception as e:
            logger.error(f"❌ Transcription failed: {e}")
            return {
                'success': False,
                'error': f"Transcription failed: {str(e)}",
                'transcription': ''
            }

    def _analyze_content(self, transcription: str) -> Dict[str, Any]:
        """Analyze transcribed content for structured information extraction."""
        try:
            analysis = {
                'word_count': len(transcription.split()),
                'character_count': len(transcription),
                'sentences': len([s for s in transcription.split('.') if s.strip()]),
                'keywords': [],
                'entities': [],
                'topics': [],
                'structured_data': {}
            }

            # Extract potential structured information
            text_lower = transcription.lower()

            # Look for recipe ingredients (for strawberry pie example)
            if any(keyword in text_lower for keyword in ['recipe', 'ingredients', 'cooking', 'baking', 'pie', 'cake']):
                analysis['topics'].append('recipe')
                analysis['structured_data']['recipe_indicators'] = self._extract_recipe_info(transcription)

            # Look for homework/educational content (for homework example)
            if any(keyword in text_lower for keyword in ['homework', 'assignment', 'page', 'chapter', 'exercise', 'problem']):
                analysis['topics'].append('education')
                analysis['structured_data']['education_indicators'] = self._extract_education_info(transcription)

            # Extract numbers and quantities
            import re
            numbers = re.findall(r'\b\d+(?:\.\d+)?\b', transcription)
            analysis['structured_data']['numbers'] = numbers

            # Extract page references
            page_refs = re.findall(r'page\s+(\d+)', text_lower)
            if page_refs:
                analysis['structured_data']['page_numbers'] = page_refs

            return analysis

        except Exception as e:
            logger.warning(f"⚠️ Content analysis failed: {e}")
            return {'error': str(e)}
    def _extract_recipe_info(self, text: str) -> Dict[str, Any]:
        """Extract recipe-specific information from transcription."""
        import re

        recipe_info = {
            'ingredients': [],
            'quantities': [],
            'cooking_methods': [],
            'time_references': []
        }

        text_lower = text.lower()

        # Extract ingredients given quantity-first, e.g. "2 cups of flour"
        quantity_first_pattern = (
            r'(\d+(?:\.\d+)?)\s*(cups?|tablespoons?|teaspoons?|pounds?|ounces?|grams?)'
            r'\s+(?:of\s+)?([a-zA-Z\s]+)'
        )
        for quantity, unit, ingredient in re.findall(quantity_first_pattern, text_lower):
            if ingredient.strip():
                recipe_info['ingredients'].append({
                    'ingredient': ingredient.strip(),
                    'quantity': quantity,
                    'unit': unit
                })

        # Extract ingredients listed name-first, e.g. "flour, 2 cups"
        name_first_pattern = r'([a-zA-Z][a-zA-Z\s]*)\s*,\s*(\d+(?:\.\d+)?)\s*(cups?|tablespoons?|teaspoons?)'
        for ingredient, quantity, unit in re.findall(name_first_pattern, text_lower):
            if ingredient.strip():
                recipe_info['ingredients'].append({
                    'ingredient': ingredient.strip(),
                    'quantity': quantity,
                    'unit': unit
                })

        # Look for common cooking methods
        cooking_methods = ['bake', 'mix', 'stir', 'whip', 'fold', 'beat', 'combine', 'add', 'pour']
        for method in cooking_methods:
            if method in text_lower:
                recipe_info['cooking_methods'].append(method)

        # Extract time references
        time_patterns = [
            r'(\d+)\s*minutes?',
            r'(\d+)\s*hours?',
            r'(\d+)\s*degrees?'
        ]
        for pattern in time_patterns:
            matches = re.findall(pattern, text_lower)
            recipe_info['time_references'].extend(matches)

        return recipe_info

    def _extract_education_info(self, text: str) -> Dict[str, Any]:
        """Extract education-specific information from transcription."""
        import re

        education_info = {
            'page_numbers': [],
            'chapter_numbers': [],
            'exercise_numbers': [],
            'subjects': [],
            'assignments': []
        }

        text_lower = text.lower()

        # Extract page numbers
        page_patterns = [
            r'page\s+(\d+)',
            r'on\s+page\s+(\d+)',
            r'turn\s+to\s+page\s+(\d+)'
        ]
        for pattern in page_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['page_numbers'].extend(matches)

        # Extract chapter numbers
        chapter_patterns = [
            r'chapter\s+(\d+)',
            r'unit\s+(\d+)'
        ]
        for pattern in chapter_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['chapter_numbers'].extend(matches)

        # Extract exercise/problem numbers
        exercise_patterns = [
            r'exercise\s+(\d+)',
            r'problem\s+(\d+)',
            r'question\s+(\d+)'
        ]
        for pattern in exercise_patterns:
            matches = re.findall(pattern, text_lower)
            education_info['exercise_numbers'].extend(matches)

        # Identify subjects
        subjects = ['math', 'mathematics', 'science', 'history', 'english',
                    'literature', 'physics', 'chemistry', 'biology']
        for subject in subjects:
            if subject in text_lower:
                education_info['subjects'].append(subject)

        return education_info

    def extract_specific_info(self, transcription: str, info_type: str) -> List[str]:
        """
        Extract specific information from transcription.

        Args:
            transcription: The transcribed text
            info_type: Type of information to extract ('ingredients', 'page_numbers', 'numbers', etc.)
        Returns:
            List of extracted information
        """
        import re

        if info_type == 'ingredients':
            # Extract ingredients from recipe transcription
            ingredients = []
            text_lower = transcription.lower()

            # Common ingredient words
            ingredient_keywords = [
                'flour', 'sugar', 'butter', 'eggs', 'milk', 'cream', 'vanilla',
                'strawberries', 'berries', 'fruit', 'salt', 'baking powder',
                'cinnamon', 'nutmeg', 'lemon', 'orange', 'chocolate', 'nuts'
            ]

            for keyword in ingredient_keywords:
                if keyword in text_lower:
                    # Try to extract with quantity
                    pattern = rf'(\d+(?:\.\d+)?)\s*(?:cups?|tablespoons?|teaspoons?|pounds?|ounces?)?\s*(?:of\s+)?{keyword}'
                    matches = re.findall(pattern, text_lower)
                    if matches:
                        ingredients.extend([f"{match} {keyword}" for match in matches])
                    else:
                        ingredients.append(keyword)

            return list(set(ingredients))  # Remove duplicates

        elif info_type == 'page_numbers':
            # Extract page numbers
            patterns = [
                r'page\s+(\d+)',
                r'on\s+page\s+(\d+)',
                r'turn\s+to\s+page\s+(\d+)',
                r'go\s+to\s+page\s+(\d+)'
            ]

            page_numbers = []
            for pattern in patterns:
                matches = re.findall(pattern, transcription.lower())
                page_numbers.extend(matches)

            return list(set(page_numbers))  # Remove duplicates

        elif info_type == 'numbers':
            # Extract all numbers
            numbers = re.findall(r'\b\d+(?:\.\d+)?\b', transcription)
            return numbers

        else:
            return []

    def get_tool_functions(self) -> List[Dict[str, Any]]:
        """Get function definitions for AGNO integration."""
        return [
            {
                "name": "process_audio_file",
                "description": "Process audio file with speech-to-text transcription and content analysis",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "file_path": {
                            "type": "string",
                            "description": "Path to the audio file to process"
                        },
                        "extract_content": {
                            "type": "boolean",
                            "description": "Whether to perform content analysis on transcription",
                            "default": True
                        }
                    },
                    "required": ["file_path"]
                }
            },
            {
                "name": "extract_specific_info",
                "description": "Extract specific information from audio transcription",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "transcription": {
                            "type": "string",
                            "description": "The transcribed text to analyze"
                        },
                        "info_type": {
                            "type": "string",
                            "description": "Type of information to extract",
                            "enum": ["ingredients", "page_numbers", "numbers"]
                        }
                    },
                    "required": ["transcription", "info_type"]
                }
            }
        ]


# Create tool instance for AGNO integration
def create_audio_processing_tool() -> Optional[AudioProcessingTool]:
    """Create and return audio processing tool instance."""
    try:
        tool = AudioProcessingTool()
        if tool.available:
            logger.info("✅ Audio processing tool created successfully")
            return tool
        else:
            logger.warning("⚠️ Audio processing tool not available")
            return None
    except Exception as e:
        logger.error(f"❌ Failed to create audio processing tool: {e}")
        return None
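

# Minimal usage sketch for running this module directly. This is illustrative
# only: "sample_recording.mp3" is a hypothetical path, and the audio
# dependencies (soundfile, faster-whisper) must be installed for the tool to
# become available.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    audio_tool = create_audio_processing_tool()
    if audio_tool is None:
        print("Audio processing tool unavailable - install soundfile and faster-whisper.")
    else:
        # Transcribe and analyze a (hypothetical) local recording.
        result = audio_tool.process_audio_file("sample_recording.mp3", extract_content=True)
        if result['success']:
            print(f"Transcription (confidence {result['confidence']:.2f}):")
            print(result['transcription'])
            # Pull a specific kind of information back out of the transcription.
            pages = audio_tool.extract_specific_info(result['transcription'], 'page_numbers')
            print("Page numbers mentioned:", pages)
        else:
            print(f"Processing failed: {result['error']}")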