import argparse
import json
import os

# TEST WITH:
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json


def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None


def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details,
            including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and
            'submitted_answer'.
        level_filter (int, optional): Filter evaluation to only this GAIA level.
            Defaults to None.

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details),
            where incorrect_details is a list of tuples: (task_id, expected, got).
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []

    agent_answers_map = {
        item['task_id']: item['submitted_answer'] for item in agent_answers_data
    }

    for task_id, question_info in questions_data.items():
        # Apply level filter if specified
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # GAIA uses exact match
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}")  # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate agent answers locally against GAIA ground truth."
    )
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",  # Adjusted default path
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,  # None means evaluate all levels
        help="GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers."
    )
    args = parser.parse_args()

    # Construct absolute paths relative to the script location
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    # Assume the answers file path is given relative to the repo root, one level
    # above this script in utilities/
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return

    # Ensure agent_answers_data is a list; also accept a dict that wraps the
    # list under an 'answers' key (a common mistake)
    if not isinstance(agent_answers_data, list):
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"  Expected: {expected}")
                print(f"  Got: {got}")
            print("------------------------")


if __name__ == "__main__":
    main()
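
# For reference, the input shapes this script expects. These are illustrative
# sketches: the field names ('Level', 'Final Answer', 'task_id',
# 'submitted_answer') come from the code above; the example values are made up.
#
# new_gaia_questions.json -- a dict keyed by task_id:
#   {"abc123": {"Level": 1, "Question": "...", "Final Answer": "42"}}
#
# agent_answers.json -- a list of submissions (or a dict with an 'answers' key):
#   [{"task_id": "abc123", "submitted_answer": "42"}]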