Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

File size: 17,877 Bytes

9a6a4dc

"""
GAIA Agent - Simplified Working Version
Complete AGNO Tools with Basic Multimodal Integration

This agent provides comprehensive GAIA evaluation capabilities using:
- All AGNO tools (calculator, python, wikipedia, arxiv, firecrawl, exa, file, shell)
- Basic multimodal tools (Mistral Vision when available)
- Simple, reliable answer formatting
- No complex dependencies that cause import failures

Advantages:
- Single agent for all GAIA tasks (text, math, multimodal)
- AGNO's native orchestration handles tool selection
- Simple, reliable architecture that works in HuggingFace Space
- Consistent error handling and response formatting
- No complex import dependencies
"""

import os
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path

from agno.agent import Agent
from agno.models.mistral import MistralChat

# Import European open-source multimodal tools.
# The package-relative import is tried first; when that raises ImportError a
# plain top-level import is attempted. If both fail, OpenSourceMultimodalTools
# is set to None and MULTIMODAL_AVAILABLE to False so the rest of the module
# can check the flag instead of failing at import time.
try:
    from .mistral_multimodal_agent import OpenSourceMultimodalTools
    MULTIMODAL_AVAILABLE = True
except ImportError:
    try:
        from mistral_multimodal_agent import OpenSourceMultimodalTools
        MULTIMODAL_AVAILABLE = True
    except ImportError:
        OpenSourceMultimodalTools = None
        MULTIMODAL_AVAILABLE = False

# Simple answer formatting without complex dependencies
class SimpleAnswerFormatter:
    """Reduce a free-form model response to a single short answer line.

    The formatter trims whitespace, drops well-known lead-in phrases
    ("Answer:", "The final answer is:", ...), removes markdown emphasis
    markers and returns the first line that is neither a markdown heading
    nor a list bullet.
    """

    # Lead-in phrases removed (case-insensitively) from the start of a text.
    _PREFIXES = (
        "The answer is:",
        "Answer:",
        "Final answer:",
        "The final answer is:",
        "Based on my analysis,",
        "According to my research,",
    )

    @classmethod
    def _strip_prefixes(cls, text: str) -> str:
        """Return *text* with every leading known prefix removed.

        Prefixes are removed repeatedly so stacked lead-ins such as
        "Final answer: The answer is: 42" are fully reduced to "42".
        """
        remaining = text.strip()
        matched = True
        while matched:
            matched = False
            lowered = remaining.lower()
            for prefix in cls._PREFIXES:
                if lowered.startswith(prefix.lower()):
                    remaining = remaining[len(prefix):].strip()
                    matched = True
                    break
        return remaining

    @staticmethod
    def _is_structural_line(line: str) -> bool:
        """Return True for markdown headings and dash bullets/rules.

        A leading '-' directly followed by a digit (or ".digit") is a
        negative number, i.e. a real answer, and is not treated as a bullet.
        """
        if line.startswith('#'):
            return True
        if line.startswith('-'):
            rest = line[1:]
            looks_numeric = rest[:1].isdigit() or (
                rest[:1] == '.' and rest[1:2].isdigit()
            )
            return not looks_numeric
        return False

    def format_answer(self, response: str, question: Optional[str] = None) -> str:
        """Format response for GAIA evaluation.

        Args:
            response: Raw text produced by the agent. Falsy values yield "".
                Non-string values are converted with ``str()``.
            question: The original question. Currently unused; kept so
                callers can pass it without change.

        Returns:
            The first non-empty line that is not a heading or bullet, with
            lead-in phrases and ``*`` emphasis markers removed. If no such
            line exists, the whole cleaned text is returned.
        """
        if not response:
            return ""

        answer = self._strip_prefixes(str(response))

        # Remove markdown emphasis, then strip prefixes again: a lead-in such
        # as "**Answer:** 42" only becomes recognisable once the asterisks
        # are gone.
        answer = answer.replace("**", "").replace("*", "")
        answer = self._strip_prefixes(answer)

        for line in answer.split('\n'):
            line = line.strip()
            if not line or self._is_structural_line(line):
                continue
            # The chosen line may itself start with a lead-in phrase; a line
            # consisting only of a lead-in is skipped in favour of the next.
            candidate = self._strip_prefixes(line)
            if candidate:
                return candidate

        return answer

# Load environment variables from .env file
def load_env_file(env_path: str = '.env', override: bool = True) -> None:
    """Load KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are whitespace-trimmed, and one pair of matching
    surrounding quotes is removed from the value. Lines with an empty key are
    skipped because ``os.environ`` rejects empty names.

    Args:
        env_path: Path of the file to read. Defaults to ``.env`` in the
            current working directory. A missing file is silently ignored.
        override: When True (the default, matching the previous behaviour)
            values from the file replace variables already present in the
            environment; when False existing variables are left untouched.
    """
    env_file = Path(env_path)
    # is_file() rather than exists(): a directory named ".env" must not crash.
    if not env_file.is_file():
        return

    # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM.
    with open(env_file, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue

            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            if not key:
                continue

            # KEY="value" / KEY='value' -> value
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            if override or key not in os.environ:
                os.environ[key] = value

# Load environment variables at module level.
# This runs on import, before the module-level GAIAAgent is constructed, so
# keys defined in .env are visible to the os.getenv() lookups made there.
load_env_file()

# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)


class GAIAAgent:
    """
    GAIA Agent with comprehensive AGNO tools and basic multimodal capabilities.
    
    This agent combines all AGNO tools with basic multimodal processing,
    providing a single interface for all GAIA evaluation tasks including:
    - Text and mathematical reasoning
    - Basic image analysis using Mistral Vision
    - Web research and content extraction
    - Simple, reliable answer formatting
    """
    
    def __init__(self):
        """Set up the formatter, the tool list and the underlying AGNO agent.

        After construction ``self.available`` tells whether the agent can be
        used: it is False when MISTRAL_API_KEY is missing or when agent
        creation failed, in which case ``self.agent`` is None.
        """
        logger.info("🚀 Initializing Unified AGNO Agent...")

        # Formatter that turns raw model output into a short answer
        self.response_formatter = SimpleAnswerFormatter()

        # Standard AGNO tools first, then the optional multimodal toolkit
        self.tools = self._init_all_agno_tools()
        self.multimodal_tools = self._init_multimodal_tools()
        if self.multimodal_tools:
            self.tools.extend(self.multimodal_tools.tools)

        # Without the Mistral key there is no model to drive the agent
        api_key = os.getenv("MISTRAL_API_KEY")
        self.mistral_api_key = api_key
        if not api_key:
            logger.error("❌ MISTRAL_API_KEY not found - AGNO agent requires this for orchestration")
            self.agent, self.available = None, False
            return

        # Build the agent; availability mirrors whether that succeeded
        created_agent = self._create_agno_agent()
        self.agent = created_agent
        self.available = created_agent is not None

        if not self.available:
            logger.error("❌ Unified AGNO Agent initialization failed")
            return

        logger.info("✅ Unified AGNO Agent initialized successfully")
        logger.info(f"📊 Available tools: {len(self.tools)}")
    
    def _init_all_agno_tools(self) -> List[Any]:
        """Import and instantiate every AGNO tool that can be loaded.

        Each tool is imported dynamically. A tool is skipped, with a warning,
        when its required environment variable is unset, when its module
        cannot be imported, or when its constructor raises. A per-tool status
        summary is logged at the end.

        Returns:
            The list of successfully created tool instances (possibly empty).
        """
        # (name, module, class, required env var, description, optional deps)
        tool_specs = [
            # Core computational tools
            ('calculator', 'agno.tools.calculator', 'CalculatorTools', None,
             'Mathematical calculations and operations', []),
            ('python', 'agno.tools.python', 'PythonTools', None,
             'Python code execution and analysis', []),
            # Knowledge and research tools
            ('wikipedia', 'agno.tools.wikipedia', 'WikipediaTools', None,
             'Wikipedia knowledge retrieval', []),
            ('arxiv', 'agno.tools.arxiv', 'ArxivTools', None,
             'Academic research via ArXiv', []),
            # Web tools
            ('firecrawl', 'agno.tools.firecrawl', 'FirecrawlTools', 'FIRECRAWL_API_KEY',
             'Web content extraction', []),
            ('exa', 'agno.tools.exa', 'ExaTools', 'EXA_API_KEY',
             'Advanced web search', []),
            # System tools
            ('file', 'agno.tools.file', 'FileTools', None,
             'File operations and management', []),
            ('shell', 'agno.tools.shell', 'ShellTools', None,
             'System shell operations', []),
            # Optional multimodal tools
            ('youtube', 'agno.tools.youtube', 'YouTubeTools', None,
             'YouTube video transcription and analysis', ['youtube_transcript_api']),
        ]

        # Tools whose constructor receives an explicit api_key argument,
        # mapped to the environment variable holding that key.
        api_key_env = {
            'exa': 'EXA_API_KEY',
            'firecrawl': 'FIRECRAWL_API_KEY',
        }

        loaded_tools = []
        status_by_tool = {}

        for name, module_path, class_name, env_var, summary, optional_deps in tool_specs:
            try:
                # Skip tools whose required credential is not configured
                if env_var and not os.getenv(env_var):
                    logger.warning(f"⚠️ {env_var} not found, {name} tool unavailable")
                    status_by_tool[name] = f"Missing {env_var}"
                    continue

                # Resolve the tool class dynamically
                module = __import__(module_path, fromlist=[class_name])
                tool_class = getattr(module, class_name)

                key_var = api_key_env.get(name)
                if key_var:
                    instance = tool_class(api_key=os.getenv(key_var))
                else:
                    instance = tool_class()

                loaded_tools.append(instance)
                status_by_tool[name] = "✅ Available"
                logger.info(f"✅ {class_name} initialized: {summary}")

            except ImportError as err:
                missing_optional = bool(optional_deps) and any(
                    dep in str(err) for dep in optional_deps
                )
                if missing_optional:
                    logger.warning(f"⚠️ {class_name} not available: missing optional dependency")
                    status_by_tool[name] = "Missing optional dependency"
                else:
                    logger.warning(f"⚠️ {class_name} not available: {err}")
                    status_by_tool[name] = f"Import error: {str(err)[:50]}"
            except Exception as err:
                logger.warning(f"⚠️ {class_name} not available: {err}")
                status_by_tool[name] = f"Error: {str(err)[:50]}"

        # Summarise which tools ended up usable
        logger.info("📊 AGNO Tools Status:")
        for name, state in status_by_tool.items():
            logger.info(f"  {name}: {state}")

        return loaded_tools
    
    def _init_multimodal_tools(self) -> Optional[Any]:
        """Create the optional multimodal toolkit.

        Returns:
            An ``OpenSourceMultimodalTools`` instance, or None when the
            toolkit could not be imported or its constructor raised.
        """
        if MULTIMODAL_AVAILABLE:
            try:
                toolkit = OpenSourceMultimodalTools()
                logger.info("✅ European open-source multimodal tools initialized")
                logger.info("🇪🇺 Features: Image analysis (BLIP-2/Mistral Vision), Audio transcription (Faster-Whisper), Document analysis")
                return toolkit
            except Exception as err:
                # Best effort: the agent still works without multimodal support
                logger.warning(f"⚠️ Failed to initialize multimodal tools: {err}")
                return None

        logger.warning("⚠️ European open-source multimodal tools not available")
        return None
    
    def _create_agno_agent(self) -> Optional[Agent]:
        """Build the AGNO ``Agent`` backed by a Mistral chat model.

        Uses ``self.mistral_api_key`` and ``self.tools``. An empty tool list
        only produces a warning; the agent is still created.

        Returns:
            The configured agent, or None if construction raised.
        """
        if not self.tools:
            logger.warning("⚠️ No AGNO tools available, creating agent without tools")

        try:
            # Mistral model that drives the agent
            llm = MistralChat(
                api_key=self.mistral_api_key,
                id="mistral-large-latest",  # latest large model for better function calling
                temperature=0.1,  # low temperature for factual accuracy
                max_tokens=2000,
            )

            agent_options = dict(
                model=llm,
                tools=self.tools,
                instructions=self._get_agent_instructions(),
                show_tool_calls=True,  # tool call visibility for debugging
                markdown=True,
                debug_mode=True,  # debug mode to see tool usage
            )
            created = Agent(**agent_options)

            logger.info(f"✅ Unified AGNO Agent created with {len(self.tools)} tools")
            return created

        except Exception as err:
            logger.error(f"❌ Failed to create AGNO agent: {err}")
            return None
    
    def _get_agent_instructions(self) -> str:
        """Return the fixed instruction prompt given to the AGNO agent.

        The text is a constant: it does not depend on instance state, so it
        lists every tool category regardless of which tools were actually
        initialized. It states the GAIA answer requirements, when each tool
        should be used, a general strategy and the expected answer format.

        Returns:
            The instruction prompt as a single multi-line string.
        """
        return """You are a GAIA evaluation agent with access to comprehensive AGNO tools.

CRITICAL GAIA EVALUATION REQUIREMENTS:
1. EXACT ANSWER MATCHING: Your final answer must match the expected answer EXACTLY
2. NO EXPLANATIONS: Provide only the final answer, no reasoning or explanations
3. PRECISE FORMAT: Follow the exact format expected (number, text, etc.)
4. FACTUAL ACCURACY: Use tools to verify all information before answering

AVAILABLE TOOLS AND WHEN TO USE THEM:

CORE COMPUTATIONAL TOOLS:
1. CALCULATOR TOOLS - Use for:
   - Mathematical calculations and operations
   - Unit conversions and numerical computations
   - Complex mathematical expressions

2. PYTHON TOOLS - Use for:
   - Code execution and analysis
   - Data processing and calculations
   - Algorithm implementation

KNOWLEDGE AND RESEARCH TOOLS:
3. WIKIPEDIA TOOLS - Use ONLY when:
   - Wikipedia is explicitly mentioned in the question
   - Question specifically asks about Wikipedia content
   - Question references "according to Wikipedia" or similar

4. ARXIV TOOLS - Use for:
   - Academic research and scientific papers
   - Technical and research-oriented questions
   - Latest scientific developments

WEB RESEARCH TOOLS:
5. EXA TOOLS - Use for:
   - General web search and research
   - Finding current information and recent developments
   - Biographical information and general knowledge queries
   - Any web-based fact-checking and information gathering

6. FIRECRAWL TOOLS - Use for:
   - Web content extraction from specific URLs provided in the question
   - Detailed webpage analysis when URL is given
   - Content scraping when specific URLs need to be processed

SYSTEM TOOLS:
7. FILE TOOLS - Use for:
   - File operations and management
   - Reading and processing local files
   - File system operations

8. SHELL TOOLS - Use for:
   - System operations and commands
   - Environment queries
   - System-level information gathering

9. YOUTUBE TOOLS - Use for:
   - YouTube video transcription
   - Video content analysis via transcripts
   - Understanding video content without watching

MULTIMODAL TOOLS (European Open-Source):
10. IMAGE ANALYSIS - Use for:
    - Analyzing images using BLIP-2 or Mistral Vision
    - Answering questions about image content
    - Visual reasoning and description

11. AUDIO TRANSCRIPTION - Use for:
    - Transcribing audio files using Faster-Whisper (European community-driven)
    - Converting speech to text for analysis
    - Processing audio content

12. DOCUMENT ANALYSIS - Use for:
    - Analyzing document content and answering questions
    - Text-based document processing
    - Document question-answering using DistilBERT

GENERAL STRATEGY:
1. Analyze the question to determine the most appropriate tool(s)
2. Use tools systematically to gather accurate information
3. Synthesize findings into a precise, compliant answer
4. Always prioritize accuracy and factual correctness
5. Use multiple tools if needed for verification

ANSWER FORMAT:
- Provide ONLY the final answer
- No explanations, reasoning, or additional text
- Match the expected format exactly (number, text, date, etc.)
- Ensure factual accuracy through tool verification"""
    
    def __call__(self, question: str) -> str:
        """Answer *question* with the AGNO agent and format the result.

        Args:
            question: The question text to answer.

        Returns:
            The formatted answer. When the agent is unavailable the literal
            string "Agent not available" is returned; when processing raises,
            the string "Error: <message>" is returned instead of propagating
            the exception.
        """
        if not self.available:
            logger.error("❌ Unified AGNO Agent not available - check MISTRAL_API_KEY")
            return "Agent not available"

        try:
            # str() keeps the preview slice safe for non-string questions
            logger.info(f"🤔 Processing question with Unified AGNO Agent: {str(question)[:100]}...")

            # Use AGNO agent to process the question with full orchestration
            response = self.agent.run(question)

            # Prefer the response's ``content`` attribute, else the response
            # itself. Normalise to text: content can be None or a non-string
            # payload, and slicing it for the log below used to raise,
            # turning a valid (or empty) answer into an "Error: ..." string.
            payload = getattr(response, 'content', response)
            raw_answer = "" if payload is None else str(payload)

            # Format the response for GAIA evaluation
            formatted_answer = self.response_formatter.format_answer(raw_answer, question)

            logger.info("✅ Question processed successfully")
            logger.info(f"📝 Raw answer: {raw_answer[:200]}...")
            logger.info(f"🎯 Formatted answer: {formatted_answer}")

            return formatted_answer

        except Exception as e:
            # Top-level boundary: callers expect a string, never an exception
            logger.error(f"❌ Error processing question: {e}")
            return f"Error: {str(e)}"
    
    def get_tool_status(self) -> Dict[str, Any]:
        """Report the agent's readiness and tool availability.

        Returns:
            A dict with the keys ``available``, ``tools_count``,
            ``mistral_api_key_present``, ``agent_created``,
            ``multimodal_tools_available`` and ``multimodal_status`` (the
            multimodal toolkit's own capability report, or an empty dict
            when no toolkit is loaded).
        """
        toolkit = getattr(self, 'multimodal_tools', None)
        multimodal_status = toolkit.get_capabilities_status() if toolkit else {}

        report = {}
        report['available'] = self.available
        report['tools_count'] = len(self.tools) if self.tools else 0
        report['mistral_api_key_present'] = bool(self.mistral_api_key)
        report['agent_created'] = self.agent is not None
        report['multimodal_tools_available'] = MULTIMODAL_AVAILABLE
        report['multimodal_status'] = multimodal_status
        return report


# Create global agent instance.
# NOTE: this runs at import time, so importing the module initializes all
# tools and, when MISTRAL_API_KEY is set, builds the AGNO agent.
gaia_agent = GAIAAgent()


def process_question(question: str) -> str:
    """Answer *question* with the module-level ``gaia_agent`` instance."""
    answer = gaia_agent(question)
    return answer


def get_agent_status() -> Dict[str, Any]:
    """Return the status report of the module-level ``gaia_agent``."""
    status_report = gaia_agent.get_tool_status()
    return status_report