# NOTE: web-page residue from the original capture ("Spaces: Sleeping Sleeping"); not part of the module.
# coding: utf-8
# Copyright (c) 2025 inclusionAI.
import abc | |
from dataclasses import dataclass, field | |
from typing import List, Optional, Dict, Any | |
from aworld.config.conf import EvaluationConfig | |
from aworld.core.context.base import Context | |
class EvaluationCriteria:
    """Placeholder type for evaluation criteria.

    Declares no fields or behavior of its own.
    """
@dataclass
class EvaluationResult:
    """Outcome record for one evaluated task.

    ``@dataclass`` is required here: the ``field(default_factory=dict)``
    defaults only produce a fresh dict per instance when the class is
    processed by the dataclass machinery.
    """

    # Identifier of the evaluated task (required, no default).
    task_id: str
    # Free-form metric name -> value mapping; fresh empty dict per instance.
    metrics: Dict[str, Any] = field(default_factory=dict)
    # Free-form additional details; fresh empty dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
    # Numeric score, 0.0 unless set by the evaluator.
    score: float = 0.
    # Pass/fail flag, False unless set by the evaluator.
    passed: bool = False
    # Optional error description; None when not set.
    error_message: Optional[str] = None
class Evaluator(metaclass=abc.ABCMeta):
    """Base class for evaluators organised as a pre/do/post pipeline.

    ``evaluate`` drives the pipeline by awaiting ``pre_evaluate``,
    ``do_evaluate`` and ``post_evaluate`` in that order. Subclasses are
    expected to override ``do_evaluate``; the base implementations of
    ``run``, ``pre_evaluate`` and ``do_evaluate`` do nothing and return
    ``None``.

    The metaclass is declared with the Python 3 ``metaclass=`` keyword; a
    ``__metaclass__`` class attribute is ignored by Python 3. No method is
    marked abstract, so the class remains directly instantiable.
    """

    def __init__(self,
                 conf: "EvaluationConfig",
                 dataset: object = None,
                 file_path: Optional[str] = None,
                 context: Optional["Context"] = None,
                 results: Optional[List[str]] = None,
                 ground_truth: Optional[List[str]] = None):
        """Store the evaluation inputs on the instance without processing them.

        Args:
            conf: Evaluation configuration.
            dataset: Dataset object to evaluate, if any.
            file_path: Path associated with the evaluation, if any
                (presumably the dataset location -- confirm with callers).
            context: Runtime context, if any.
            results: Result strings, if already available (presumably the
                outputs to be scored -- confirm with callers).
            ground_truth: Reference strings, if any.
        """
        self.conf = conf
        self.context = context
        self.dataset = dataset
        self.file_path = file_path
        self.results = results
        self.ground_truth = ground_truth
        # Initialised to None; nothing in this class assigns it afterwards.
        self.eval_results = None

    async def run(self):
        """The evaluation complete pipeline.

        The base implementation is a no-op that returns ``None``.
        """

    async def evaluate(self) -> "EvaluationResult":
        """Evaluate the dataset/task.

        Awaits ``pre_evaluate``, then ``do_evaluate``, then passes the
        outcome through ``post_evaluate``.

        Returns:
            Whatever ``post_evaluate`` returns for the ``do_evaluate`` outcome.
        """
        await self.pre_evaluate()
        results = await self.do_evaluate()
        return await self.post_evaluate(results)

    async def do_evaluate(self) -> "EvaluationResult":
        """Implement specific evaluation process.

        The base implementation does nothing and returns ``None``.
        """

    async def pre_evaluate(self) -> None:
        """Can be used to perform any setup before evaluation."""

    async def post_evaluate(self, evaluate_result: "EvaluationResult") -> "EvaluationResult":
        """Used to perform integration testing or clean up tasks after evaluation.

        Args:
            evaluate_result: The result of the evaluate dataset.

        Returns:
            ``evaluate_result`` unchanged in the base implementation.
        """
        return evaluate_result