{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "instance_level_eval_0.2.0",
  "type": "object",
  "description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
  "required": [
    "schema_version",
    "evaluation_id",
    "model_id",
    "evaluation_name",
    "sample_id",
    "interaction_type",
    "input",
    "answer_attribution",
    "evaluation"
  ],
  "additionalProperties": true,
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this instance data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
    },
    "model_id": {
      "type": "string",
      "description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
    },
    "evaluation_name": {
      "type": "string",
      "description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
    },
    "sample_id": {
      "type": ["integer", "string"],
      "description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
    },
    "sample_hash": {
      "type": "string",
      "description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
    },
    "interaction_type": {
      "type": "string",
      "enum": ["single_turn", "multi_turn", "agentic"],
      "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
    },
    "input": {
      "type": "object",
      "description": "Input data for the evaluation sample",
      "required": ["raw", "reference"],
      "properties": {
        "raw": {
          "type": "string",
          "description": "The raw input as defined in the eval"
        },
        "formatted": {
          "type": "string",
          "description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
        },
        "reference": {
          "type": "string",
          "description": "Ground truth or reference answer for comparison/scoring"
        },
        "choices": {
          "type": "array",
          "description": "Optional list of choices for multiple-choice questions",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "output": {
      "type": ["object", "null"],
      "description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
      "required": ["raw"],
      "properties": {
        "raw": {
          "type": "string",
          "description": "Complete model response"
        },
        "reasoning_trace": {
          "type": ["string", "null"],
          "description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
        }
      }
    },
    "interactions": {
      "type": ["array", "null"],
      "description": "List of interactions - used for multi_turn and agentic, null for single_turn",
      "items": {
        "type": "object",
        "required": ["turn_idx", "role"],
        "properties": {
          "turn_idx": {
            "type": "integer",
            "minimum": 0,
            "description": "Index starting from 0 indicating the position in the conversation"
          },
          "role": {
            "type": "string",
            "description": "Role of the speaker (e.g. user, assistant, system, tool)"
          },
          "content": {
            "type": ["string", "null"],
            "description": "The actual raw text for that particular turn (can be null if empty)"
          },
          "reasoning_trace": {
            "type": ["string", "null"],
            "description": "Reasoning trace for that particular turn if applicable"
          },
          "tool_calls": {
            "type": ["array", "null"],
            "description": "List of tool invocations for this turn, if applicable",
            "items": {
              "type": "object",
              "required": ["id", "name"],
              "properties": {
                "id": {
                  "type": "string",
                  "description": "Unique identifier for the tool call"
                },
                "name": {
                  "type": "string",
                  "description": "Name of tool/function"
                },
                "arguments": {
                  "type": "object",
                  "description": "Arguments used to call the tool",
                  "additionalProperties": true
                }
              }
            }
          },
          "tool_call_id": {
            "oneOf": [
              {
                "type": "string",
                "description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
              },
              {
                "type": "array",
                "description": "Reference to the tool call ID(s) this message has the content payload for.",
                "items": {
                  "type": "string"
                }
              }
            ]
          }
        }
      }
    },
    "answer_attribution": {
      "type": "array",
      "description": "Information about how the answer was extracted from the model output",
      "items": {
        "type": "object",
        "required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
        "properties": {
          "turn_idx": {
            "type": "integer",
            "minimum": 0,
            "description": "Turn index in interactions. 0 for single_turn"
          },
          "source": {
            "type": "string",
            "description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
          },
          "extracted_value": {
            "type": "string",
            "description": "Value that was extracted"
          },
          "extraction_method": {
            "type": "string",
            "description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
          },
          "is_terminal": {
            "type": "boolean",
            "description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
          }
        }
      }
    },
    "evaluation": {
      "type": "object",
      "description": "Evaluation results and scoring data",
      "required": ["score", "is_correct"],
      "properties": {
        "score": {
          "type": ["number", "boolean"],
          "description": "Instance-level score"
        },
        "is_correct": {
          "type": "boolean",
          "description": "Whether the final answer is correct"
        },
        "num_turns": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of turns in the interaction"
        },
        "tool_calls_count": {
          "type": "integer",
          "minimum": 0,
          "description": "Count of tool calls across all turns in interactions"
        }
      }
    },
    "token_usage": {
      "type": ["object", "null"],
      "description": "Token usage for the model completion",
      "required": ["input_tokens", "output_tokens", "total_tokens"],
      "properties": {
        "input_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total input tokens used"
        },
        "output_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total output tokens used"
        },
        "total_tokens": {
          "type": "integer",
          "minimum": 0,
          "description": "Total tokens used"
        },
        "input_tokens_cache_write": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens written to the cache"
        },
        "input_tokens_cache_read": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens retrieved from the cache"
        },
        "reasoning_tokens": {
          "type": ["integer", "null"],
          "minimum": 0,
          "description": "Number of tokens used for reasoning"
        }
      }
    },
    "performance": {
      "type": ["object", "null"],
      "description": "Performance and latency metrics",
      "properties": {
        "latency_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Total latency in milliseconds"
        },
        "time_to_first_token_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Time to first token in milliseconds"
        },
        "generation_time_ms": {
          "type": ["number", "null"],
          "minimum": 0,
          "description": "Time for generation in milliseconds"
        }
      },
      "additionalProperties": true
    },
    "error": {
      "type": ["string", "null"],
      "description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
    },
    "metadata": {
      "type": "object",
      "description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
      "additionalProperties": true
    }
  },
  "allOf": [
    {
      "if": {
        "properties": {
          "interaction_type": {
            "const": "single_turn"
          }
        }
      },
      "then": {
        "required": ["output"],
        "properties": {
          "output": {
            "type": "object"
          },
          "interactions": {
            "type": "null"
          }
        }
      }
    },
    {
      "if": {
        "properties": {
          "interaction_type": {
            "enum": ["multi_turn", "agentic"]
          }
        }
      },
      "then": {
        "required": ["interactions"],
        "properties": {
          "output": {
            "type": "null"
          },
          "interactions": {
            "type": "array"
          },
          "evaluation": {
            "required": ["num_turns"]
          }
        }
      }
    }
  ]
}