Spaces:

evaleval
/

eee_validator

Running

File size: 13,319 Bytes

92ea780

{   "$schema": "http://json-schema.org/draft-07/schema#",
    "version": "instance_level_eval_0.2.0",
    "type": "object",
    "description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
    "required": [
        "schema_version",
        "evaluation_id",
        "model_id",
        "evaluation_name",
        "sample_id",
        "interaction_type",
        "input",
        "answer_attribution",
        "evaluation"
    ],
    "additionalProperties": true,
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Version of the schema used for this instance data"
        },
        "evaluation_id": {
            "type": "string",
            "description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
        },
        "model_id": {
            "type": "string",
            "description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
        },
        "evaluation_name": {
            "type": "string",
            "description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
        },
        "sample_id": {
            "type": ["integer", "string"],
            "description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
        },
        "sample_hash": {
            "type": "string",
            "description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
        },
        "interaction_type": {
            "type": "string",
            "enum": ["single_turn", "multi_turn", "agentic"],
            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
        },
        "input": {
            "type": "object",
            "description": "Input data for the evaluation sample",
            "required": ["raw", "reference"],
            "properties": {
                "raw": {
                    "type": "string",
                    "description": "The raw input as defined in the eval"
                },
                "formatted": {
                    "type": "string",
                    "description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
                },
                "reference": {
                    "type": "string",
                    "description": "Ground truth or reference answer for comparison/scoring"
                },
                "choices": {
                    "type": "array",
                    "description": "Optional list of choices for multiple-choice questions",
                    "items": {
                        "type": "string"
                    }
                }
            }
        },
        "output": {
            "type": ["object", "null"],
            "description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
            "required": ["raw"],
            "properties": {
                "raw": {
                    "type": "string",
                    "description": "Complete model response"
                },
                "reasoning_trace": {
                    "type": ["string", "null"],
                    "description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
                }
            }
        },
        "interactions": {
            "type": ["array", "null"],
            "description": "List of interactions - used for multi_turn and agentic, null for single_turn",
            "items": {
                "type": "object",
                "required": ["turn_idx", "role"],
                "properties": {
                    "turn_idx": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Index starting from 0 indicating the position in the conversation"
                    },
                    "role": {
                        "type": "string",
                        "description": "Role of the speaker (e.g. user, assistant, system, tool)"
                    },
                    "content": {
                        "type": ["string", "null"],
                        "description": "The actual raw text for that particular turn (can be null if empty)"
                    },
                    "reasoning_trace": {
                        "type": ["string", "null"],
                        "description": "Reasoning trace for that particular turn if applicable"
                    },
                    "tool_calls": {
                        "type": ["array", "null"],
                        "description": "List of tool invocations for this turn, if applicable",
                        "items": {
                            "type": "object",
                            "required": ["id", "name"],
                            "properties": {
                                "id": {
                                    "type": "string",
                                    "description": "Unique identifier for the tool call"
                                },
                                "name": {
                                    "type": "string",
                                    "description": "Name of tool/function"
                                },
                                "arguments": {
                                    "type": "object",
                                    "description": "Arguments used to call the tool",
                                    "additionalProperties": true
                                }
                            }
                        }
                    },
                    "tool_call_id": {
                        "oneOf": [
                            {
                                "type": "string",
                                "description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
                            },
                            {
                                "type": "array",
                                "description": "Reference to the tool call ID(s) this message has the content payload for.",
                                "items": {
                                    "type": "string"
                                }
                            }
                        ]
                    }
                }
            }
        },
        "answer_attribution": {
            "type": "array",
            "description": "Information about how the answer was extracted from the model output",
            "items": {
                "type": "object",
                "required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
                "properties": {
                    "turn_idx": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Turn index in interactions. 0 for single_turn"
                    },
                    "source": {
                        "type": "string",
                        "description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
                    },
                    "extracted_value": {
                        "type": "string",
                        "description": "Value that was extracted"
                    },
                    "extraction_method": {
                        "type": "string",
                        "description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
                    },
                    "is_terminal": {
                        "type": "boolean",
                        "description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
                    }
                }
            }
        },
        "evaluation": {
            "type": "object",
            "description": "Evaluation results and scoring data",
            "required": ["score", "is_correct"],
            "properties": {
                "score": {
                    "type": ["number", "boolean"],
                    "description": "Instance-level score"
                },
                "is_correct": {
                    "type": "boolean",
                    "description": "Whether the final answer is correct"
                },
                "num_turns": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Number of turns in the interaction"
                },
                "tool_calls_count": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Count of tool calls across all turns in interactions"
                }
            }
        },
        "token_usage": {
            "type": ["object", "null"],
            "description": "Token usage for the model completion",
            "required": ["input_tokens", "output_tokens", "total_tokens"],
            "properties": {
                "input_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total input tokens used"
                },
                "output_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total output tokens used"
                },
                "total_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total tokens used"
                },
                "input_tokens_cache_write": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens written to the cache"
                },
                "input_tokens_cache_read": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens retrieved from the cache"
                },
                "reasoning_tokens": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens used for reasoning"
                }
            }
        },
        "performance": {
            "type": ["object", "null"],
            "description": "Performance and latency metrics",
            "properties": {
                "latency_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Total latency in milliseconds"
                },
                "time_to_first_token_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Time to first token in milliseconds"
                },
                "generation_time_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Time for generation in milliseconds"
                }
            },
            "additionalProperties": true
        },
        "error": {
            "type": ["string", "null"],
            "description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
        },
        "metadata": {
            "type": "object",
            "description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
            "additionalProperties": true
        }
    },
    "allOf": [
        {
            "if": {
                "properties": {
                    "interaction_type": {
                        "const": "single_turn"
                    }
                }
            },
            "then": {
                "required": ["output"],
                "properties": {
                    "output": {
                        "type": "object",
                        "not": {
                            "type": "null"
                        }
                    },
                    "interactions": {
                        "type": "null"
                    }
                }
            }
        },
        {
            "if": {
                "properties": {
                    "interaction_type": {
                        "enum": ["multi_turn", "agentic"]
                    }
                }
            },
            "then": {
                "required": ["interactions"],
                "properties": {
                    "output": {
                        "type": "null"
                    },
                    "interactions": {
                        "type": "array",
                        "not": {
                            "type": "null"
                        }
                    },
                    "metrics": {
                        "required": ["num_turns"]
                    }
                }
            }
        }
    ]
}