Spaces:

evaleval
/

eee_validator

Running

App Files Files Community

eee_validator / instance_level_eval.schema.json

deepmage121

initial commit, space validation stuff

92ea780 4 days ago

raw

history blame contribute delete

13.3 kB

	{ "$schema": "http://json-schema.org/draft-07/schema#",
	"version": "instance_level_eval_0.2.0",
	"type": "object",
	"description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
	"required": [
	"schema_version",
	"evaluation_id",
	"model_id",
	"evaluation_name",
	"sample_id",
	"interaction_type",
	"input",
	"answer_attribution",
	"evaluation"
	],
	"additionalProperties": true,
	"properties": {
	"schema_version": {
	"type": "string",
	"description": "Version of the schema used for this instance data"
	},
	"evaluation_id": {
	"type": "string",
	"description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
	},
	"model_id": {
	"type": "string",
	"description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
	},
	"evaluation_name": {
	"type": "string",
	"description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
	},
	"sample_id": {
	"type": ["integer", "string"],
	"description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
	},
	"sample_hash": {
	"type": "string",
	"description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
	},
	"interaction_type": {
	"type": "string",
	"enum": ["single_turn", "multi_turn", "agentic"],
	"description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
	},
	"input": {
	"type": "object",
	"description": "Input data for the evaluation sample",
	"required": ["raw", "reference"],
	"properties": {
	"raw": {
	"type": "string",
	"description": "The raw input as defined in the eval"
	},
	"formatted": {
	"type": "string",
	"description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
	},
	"reference": {
	"type": "string",
	"description": "Ground truth or reference answer for comparison/scoring"
	},
	"choices": {
	"type": "array",
	"description": "Optional list of choices for multiple-choice questions",
	"items": {
	"type": "string"
	}
	}
	}
	},
	"output": {
	"type": ["object", "null"],
	"description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
	"required": ["raw"],
	"properties": {
	"raw": {
	"type": "string",
	"description": "Complete model response"
	},
	"reasoning_trace": {
	"type": ["string", "null"],
	"description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
	}
	}
	},
	"interactions": {
	"type": ["array", "null"],
	"description": "List of interactions - used for multi_turn and agentic, null for single_turn",
	"items": {
	"type": "object",
	"required": ["turn_idx", "role"],
	"properties": {
	"turn_idx": {
	"type": "integer",
	"minimum": 0,
	"description": "Index starting from 0 indicating the position in the conversation"
	},
	"role": {
	"type": "string",
	"description": "Role of the speaker (e.g. user, assistant, system, tool)"
	},
	"content": {
	"type": ["string", "null"],
	"description": "The actual raw text for that particular turn (can be null if empty)"
	},
	"reasoning_trace": {
	"type": ["string", "null"],
	"description": "Reasoning trace for that particular turn if applicable"
	},
	"tool_calls": {
	"type": ["array", "null"],
	"description": "List of tool invocations for this turn, if applicable",
	"items": {
	"type": "object",
	"required": ["id", "name"],
	"properties": {
	"id": {
	"type": "string",
	"description": "Unique identifier for the tool call"
	},
	"name": {
	"type": "string",
	"description": "Name of tool/function"
	},
	"arguments": {
	"type": "object",
	"description": "Arguments used to call the tool",
	"additionalProperties": true
	}
	}
	}
	},
	"tool_call_id": {
	"oneOf": [
	{
	"type": "string",
	"description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
	},
	{
	"type": "array",
	"description": "Reference to the tool call ID(s) this message has the content payload for.",
	"items": {
	"type": "string"
	}
	}
	]
	}
	}
	}
	},
	"answer_attribution": {
	"type": "array",
	"description": "Information about how the answer was extracted from the model output",
	"items": {
	"type": "object",
	"required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
	"properties": {
	"turn_idx": {
	"type": "integer",
	"minimum": 0,
	"description": "Turn index in interactions. 0 for single_turn"
	},
	"source": {
	"type": "string",
	"description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
	},
	"extracted_value": {
	"type": "string",
	"description": "Value that was extracted"
	},
	"extraction_method": {
	"type": "string",
	"description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
	},
	"is_terminal": {
	"type": "boolean",
	"description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
	}
	}
	}
	},
	"evaluation": {
	"type": "object",
	"description": "Evaluation results and scoring data",
	"required": ["score", "is_correct"],
	"properties": {
	"score": {
	"type": ["number", "boolean"],
	"description": "Instance-level score"
	},
	"is_correct": {
	"type": "boolean",
	"description": "Whether the final answer is correct"
	},
	"num_turns": {
	"type": "integer",
	"minimum": 1,
	"description": "Number of turns in the interaction"
	},
	"tool_calls_count": {
	"type": "integer",
	"minimum": 0,
	"description": "Count of tool calls across all turns in interactions"
	}
	}
	},
	"token_usage": {
	"type": ["object", "null"],
	"description": "Token usage for the model completion",
	"required": ["input_tokens", "output_tokens", "total_tokens"],
	"properties": {
	"input_tokens": {
	"type": "integer",
	"minimum": 0,
	"description": "Total input tokens used"
	},
	"output_tokens": {
	"type": "integer",
	"minimum": 0,
	"description": "Total output tokens used"
	},
	"total_tokens": {
	"type": "integer",
	"minimum": 0,
	"description": "Total tokens used"
	},
	"input_tokens_cache_write": {
	"type": ["integer", "null"],
	"minimum": 0,
	"description": "Number of tokens written to the cache"
	},
	"input_tokens_cache_read": {
	"type": ["integer", "null"],
	"minimum": 0,
	"description": "Number of tokens retrieved from the cache"
	},
	"reasoning_tokens": {
	"type": ["integer", "null"],
	"minimum": 0,
	"description": "Number of tokens used for reasoning"
	}
	}
	},
	"performance": {
	"type": ["object", "null"],
	"description": "Performance and latency metrics",
	"properties": {
	"latency_ms": {
	"type": ["number", "null"],
	"minimum": 0,
	"description": "Total latency in milliseconds"
	},
	"time_to_first_token_ms": {
	"type": ["number", "null"],
	"minimum": 0,
	"description": "Time to first token in milliseconds"
	},
	"generation_time_ms": {
	"type": ["number", "null"],
	"minimum": 0,
	"description": "Time for generation in milliseconds"
	}
	},
	"additionalProperties": true
	},
	"error": {
	"type": ["string", "null"],
	"description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
	},
	"metadata": {
	"type": "object",
	"description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
	"additionalProperties": true
	}
	},
	"allOf": [
	{
	"if": {
	"properties": {
	"interaction_type": {
	"const": "single_turn"
	}
	}
	},
	"then": {
	"required": ["output"],
	"properties": {
	"output": {
	"type": "object",
	"not": {
	"type": "null"
	}
	},
	"interactions": {
	"type": "null"
	}
	}
	}
	},
	{
	"if": {
	"properties": {
	"interaction_type": {
	"enum": ["multi_turn", "agentic"]
	}
	}
	},
	"then": {
	"required": ["interactions"],
	"properties": {
	"output": {
	"type": "null"
	},
	"interactions": {
	"type": "array",
	"not": {
	"type": "null"
	}
	},
	"metrics": {
	"required": ["num_turns"]
	}
	}
	}
	}
	]
	}