"""
GAIA Answer Formatter

This module handles reformatting of agent responses to meet the GAIA benchmark
requirements. It removes prefixes, citations, and other metadata to provide
direct, concise answers.
"""

import re
import logging
from typing import Optional, Union, Dict, Any, List

# Set up logging
logger = logging.getLogger("gaia_agent.answer_formatter")

# Boilerplate openers stripped from the start of an answer, applied in order.
_PREFIX_PATTERNS = (
    re.compile(r'^Based on my search[,:]?\s*', re.IGNORECASE),
    re.compile(r'^Here\'s what I found about [\'"].*?[\'"]:?\s*', re.IGNORECASE),
    re.compile(r'^(Regarding|About|Concerning|On) [\'"].*?[\'"]:?\s*', re.IGNORECASE),
    re.compile(r'^You asked about [\'"].*?[\'"]:?\s*', re.IGNORECASE),
)

# Trailing sections (citations, extra metadata): everything from the blank
# line that introduces the section to the end of the text is dropped.
_TRAILER_PATTERNS = (
    re.compile(r'\n\nThis information comes from.*$', re.DOTALL),
    re.compile(r'\n\nThis information is compiled from multiple sources.*$', re.DOTALL),
    re.compile(r'\n\nSource:.*$', re.DOTALL),
    re.compile(r'\n\nAdditionally:.*$', re.DOTALL),
)

# Inline "[12]"-style markers. The blanks in front of the marker are consumed
# too, so "Paris [1] is" becomes "Paris is" rather than "Paris  is".
_CITATION_MARKER = re.compile(r'[ \t]*\[\d+\]')

# A number with optional comma groups and decimal part. A leading minus is
# accepted only when it is not glued to a preceding word character, so the
# hyphen in "COVID-19" is not mistaken for a sign.
_NUMBER = re.compile(r'(?:(?<!\w)-)?\d+(?:,\d+)*(?:\.\d+)?')

# A list marker ("1.", "*", "•" or "-") followed by blanks.
_LIST_MARKER = r'(?:\d+\.|[*•-])[ \t]+'

# One list item: a marker at the start of a line, then text running up to a
# blank line, the next marker line, or the end of the text.
_LIST_ITEM = re.compile(
    r'^[ \t]*' + _LIST_MARKER
    + r'(.+?)(?=\n\n|\n[ \t]*' + _LIST_MARKER + r'|\Z)',
    re.MULTILINE | re.DOTALL,
)

# Question keywords, matched as whole words (optionally plural) so that e.g.
# "language" does not trigger "age" and "country" does not trigger "count".
_NUMERICAL_QUESTION = re.compile(
    r'\b(?:how many|how much|count|number of|total of|population|percentage'
    r'|age|height|weight|distance|length|width|depth|area|volume)s?\b'
)
_LIST_QUESTION = re.compile(
    r'\b(?:list|name|enumerate|what are the|examples of)s?\b'
)


def format_answer(answer: str) -> str:
    """
    Format an answer according to GAIA benchmark requirements.

    Removes:
    - "Based on my search..." prefixes
    - Question repetition
    - Citation information
    - Other metadata

    Args:
        answer: The original answer text. ``None`` and the empty string yield
            an empty result; any other non-string value is converted with
            ``str()`` before formatting.

    Returns:
        str: A clean, direct answer
    """
    if answer is None or answer == "":
        return ""
    if not isinstance(answer, str):
        # Tolerate e.g. a bare int produced by a tool call.
        answer = str(answer)

    # Log original answer for debugging
    logger.debug("Original answer: %s", answer)

    # Remove boilerplate openers and question repetition
    for pattern in _PREFIX_PATTERNS:
        answer = pattern.sub('', answer)

    # Remove citation information and additional metadata sections
    for pattern in _TRAILER_PATTERNS:
        answer = pattern.sub('', answer)

    # Clean up any remaining citation markers, then trim whitespace
    answer = _CITATION_MARKER.sub('', answer).strip()

    # Log the formatted answer
    logger.debug("Formatted answer: %s", answer)

    return answer


def format_numerical_answer(answer: str) -> str:
    """
    Format a numerical answer to extract just the number.

    The first number found in the cleaned answer is returned as written,
    including comma groups, a decimal part, and a leading minus sign.

    Args:
        answer: The original answer text

    Returns:
        str: Just the numerical value if one can be extracted, otherwise the
        formatted answer
    """
    # First apply general formatting
    cleaned_answer = format_answer(answer)

    # Extract the first numerical value, if any
    numerical_match = _NUMBER.search(cleaned_answer)
    if numerical_match:
        return numerical_match.group(0)

    return cleaned_answer


def format_list_answer(answer: str) -> str:
    """
    Format a list-type answer to maintain the list structure but remove
    unnecessary text.

    Only lines that start with a list marker ("1.", "*", "•" or "-") are
    treated as items; a dash or "3. " in the middle of a sentence is not.
    Items are re-emitted one per line with a uniform "- " bullet.

    Args:
        answer: The original answer text

    Returns:
        str: A cleaned list answer, or the generally formatted answer when no
        list items are found
    """
    # First apply general formatting
    cleaned_answer = format_answer(answer)

    # Extract the list items, dropping any preamble
    list_items = _LIST_ITEM.findall(cleaned_answer)
    if list_items:
        return '\n'.join(f"- {item.strip()}" for item in list_items)

    return cleaned_answer


def detect_answer_type(question: str) -> str:
    """
    Detect the type of answer expected based on the question.

    Numerical keywords take precedence over list keywords.

    Args:
        question: The question text

    Returns:
        str: The detected answer type ('numerical', 'list', or 'text')
    """
    if not question:
        return 'text'

    question_lower = question.lower()

    # Check for numerical questions
    if _NUMERICAL_QUESTION.search(question_lower):
        return 'numerical'

    # Check for list questions
    if _LIST_QUESTION.search(question_lower):
        return 'list'

    # Default to text
    return 'text'


def format_answer_by_type(answer: str, question: Optional[str] = None) -> str:
    """
    Format an answer according to the detected answer type from the question.

    Args:
        answer: The original answer text
        question: The original question (optional). When missing or empty,
            only the general formatting is applied.

    Returns:
        str: A formatted answer appropriate for the question type
    """
    if not question:
        return format_answer(answer)

    formatters = {
        'numerical': format_numerical_answer,
        'list': format_list_answer,
    }
    formatter = formatters.get(detect_answer_type(question), format_answer)
    return formatter(answer)