eee_validator / instance_level_eval.schema.json
deepmage121's picture
initial commit, space validation stuff
92ea780
{ "$schema": "http://json-schema.org/draft-07/schema#",
"version": "instance_level_eval_0.2.0",
"type": "object",
"description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
"required": [
"schema_version",
"evaluation_id",
"model_id",
"evaluation_name",
"sample_id",
"interaction_type",
"input",
"answer_attribution",
"evaluation"
],
"additionalProperties": true,
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this instance data"
},
"evaluation_id": {
"type": "string",
"description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
},
"model_id": {
"type": "string",
"description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
},
"evaluation_name": {
"type": "string",
"description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
},
"sample_id": {
"type": ["integer", "string"],
"description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
},
"sample_hash": {
"type": "string",
"description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
},
"interaction_type": {
"type": "string",
"enum": ["single_turn", "multi_turn", "agentic"],
"description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
},
"input": {
"type": "object",
"description": "Input data for the evaluation sample",
"required": ["raw", "reference"],
"properties": {
"raw": {
"type": "string",
"description": "The raw input as defined in the eval"
},
"formatted": {
"type": "string",
"description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
},
"reference": {
"type": "string",
"description": "Ground truth or reference answer for comparison/scoring"
},
"choices": {
"type": "array",
"description": "Optional list of choices for multiple-choice questions",
"items": {
"type": "string"
}
}
}
},
"output": {
"type": ["object", "null"],
"description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
"required": ["raw"],
"properties": {
"raw": {
"type": "string",
"description": "Complete model response"
},
"reasoning_trace": {
"type": ["string", "null"],
"description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
}
}
},
"interactions": {
"type": ["array", "null"],
"description": "List of interactions - used for multi_turn and agentic, null for single_turn",
"items": {
"type": "object",
"required": ["turn_idx", "role"],
"properties": {
"turn_idx": {
"type": "integer",
"minimum": 0,
"description": "Index starting from 0 indicating the position in the conversation"
},
"role": {
"type": "string",
"description": "Role of the speaker (e.g. user, assistant, system, tool)"
},
"content": {
"type": ["string", "null"],
"description": "The actual raw text for that particular turn (can be null if empty)"
},
"reasoning_trace": {
"type": ["string", "null"],
"description": "Reasoning trace for that particular turn if applicable"
},
"tool_calls": {
"type": ["array", "null"],
"description": "List of tool invocations for this turn, if applicable",
"items": {
"type": "object",
"required": ["id", "name"],
"properties": {
"id": {
"type": "string",
"description": "Unique identifier for the tool call"
},
"name": {
"type": "string",
"description": "Name of tool/function"
},
"arguments": {
"type": "object",
"description": "Arguments used to call the tool",
"additionalProperties": true
}
}
}
},
"tool_call_id": {
"oneOf": [
{
"type": "string",
"description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
},
{
"type": "array",
"description": "Reference to the tool call ID(s) this message has the content payload for.",
"items": {
"type": "string"
}
}
]
}
}
}
},
"answer_attribution": {
"type": "array",
"description": "Information about how the answer was extracted from the model output",
"items": {
"type": "object",
"required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
"properties": {
"turn_idx": {
"type": "integer",
"minimum": 0,
"description": "Turn index in interactions. 0 for single_turn"
},
"source": {
"type": "string",
"description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
},
"extracted_value": {
"type": "string",
"description": "Value that was extracted"
},
"extraction_method": {
"type": "string",
"description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
},
"is_terminal": {
"type": "boolean",
"description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
}
}
}
},
"evaluation": {
"type": "object",
"description": "Evaluation results and scoring data",
"required": ["score", "is_correct"],
"properties": {
"score": {
"type": ["number", "boolean"],
"description": "Instance-level score"
},
"is_correct": {
"type": "boolean",
"description": "Whether the final answer is correct"
},
"num_turns": {
"type": "integer",
"minimum": 1,
"description": "Number of turns in the interaction"
},
"tool_calls_count": {
"type": "integer",
"minimum": 0,
"description": "Count of tool calls across all turns in interactions"
}
}
},
"token_usage": {
"type": ["object", "null"],
"description": "Token usage for the model completion",
"required": ["input_tokens", "output_tokens", "total_tokens"],
"properties": {
"input_tokens": {
"type": "integer",
"minimum": 0,
"description": "Total input tokens used"
},
"output_tokens": {
"type": "integer",
"minimum": 0,
"description": "Total output tokens used"
},
"total_tokens": {
"type": "integer",
"minimum": 0,
"description": "Total tokens used"
},
"input_tokens_cache_write": {
"type": ["integer", "null"],
"minimum": 0,
"description": "Number of tokens written to the cache"
},
"input_tokens_cache_read": {
"type": ["integer", "null"],
"minimum": 0,
"description": "Number of tokens retrieved from the cache"
},
"reasoning_tokens": {
"type": ["integer", "null"],
"minimum": 0,
"description": "Number of tokens used for reasoning"
}
}
},
"performance": {
"type": ["object", "null"],
"description": "Performance and latency metrics",
"properties": {
"latency_ms": {
"type": ["number", "null"],
"minimum": 0,
"description": "Total latency in milliseconds"
},
"time_to_first_token_ms": {
"type": ["number", "null"],
"minimum": 0,
"description": "Time to first token in milliseconds"
},
"generation_time_ms": {
"type": ["number", "null"],
"minimum": 0,
"description": "Time for generation in milliseconds"
}
},
"additionalProperties": true
},
"error": {
"type": ["string", "null"],
"description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
},
"metadata": {
"type": "object",
"description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
"additionalProperties": true
}
},
"allOf": [
{
"if": {
"properties": {
"interaction_type": {
"const": "single_turn"
}
}
},
"then": {
"required": ["output"],
"properties": {
"output": {
"type": "object",
"not": {
"type": "null"
}
},
"interactions": {
"type": "null"
}
}
}
},
{
"if": {
"properties": {
"interaction_type": {
"enum": ["multi_turn", "agentic"]
}
}
},
"then": {
"required": ["interactions"],
"properties": {
"output": {
"type": "null"
},
"interactions": {
"type": "array",
"not": {
"type": "null"
}
},
"metrics": {
"required": ["num_turns"]
}
}
}
}
]
}