File size: 13,319 Bytes
92ea780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
{   "$schema": "http://json-schema.org/draft-07/schema#",
    "version": "instance_level_eval_0.2.0",
    "type": "object",
    "description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
    "required": [
        "schema_version",
        "evaluation_id",
        "model_id",
        "evaluation_name",
        "sample_id",
        "interaction_type",
        "input",
        "answer_attribution",
        "evaluation"
    ],
    "additionalProperties": true,
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Version of the schema used for this instance data"
        },
        "evaluation_id": {
            "type": "string",
            "description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
        },
        "model_id": {
            "type": "string",
            "description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
        },
        "evaluation_name": {
            "type": "string",
            "description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
        },
        "sample_id": {
            "type": ["integer", "string"],
            "description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
        },
        "sample_hash": {
            "type": "string",
            "description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
        },
        "interaction_type": {
            "type": "string",
            "enum": ["single_turn", "multi_turn", "agentic"],
            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
        },
        "input": {
            "type": "object",
            "description": "Input data for the evaluation sample",
            "required": ["raw", "reference"],
            "properties": {
                "raw": {
                    "type": "string",
                    "description": "The raw input as defined in the eval"
                },
                "formatted": {
                    "type": "string",
                    "description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
                },
                "reference": {
                    "type": "string",
                    "description": "Ground truth or reference answer for comparison/scoring"
                },
                "choices": {
                    "type": "array",
                    "description": "Optional list of choices for multiple-choice questions",
                    "items": {
                        "type": "string"
                    }
                }
            }
        },
        "output": {
            "type": ["object", "null"],
            "description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
            "required": ["raw"],
            "properties": {
                "raw": {
                    "type": "string",
                    "description": "Complete model response"
                },
                "reasoning_trace": {
                    "type": ["string", "null"],
                    "description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
                }
            }
        },
        "interactions": {
            "type": ["array", "null"],
            "description": "List of interactions - used for multi_turn and agentic, null for single_turn",
            "items": {
                "type": "object",
                "required": ["turn_idx", "role"],
                "properties": {
                    "turn_idx": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Index starting from 0 indicating the position in the conversation"
                    },
                    "role": {
                        "type": "string",
                        "description": "Role of the speaker (e.g. user, assistant, system, tool)"
                    },
                    "content": {
                        "type": ["string", "null"],
                        "description": "The actual raw text for that particular turn (can be null if empty)"
                    },
                    "reasoning_trace": {
                        "type": ["string", "null"],
                        "description": "Reasoning trace for that particular turn if applicable"
                    },
                    "tool_calls": {
                        "type": ["array", "null"],
                        "description": "List of tool invocations for this turn, if applicable",
                        "items": {
                            "type": "object",
                            "required": ["id", "name"],
                            "properties": {
                                "id": {
                                    "type": "string",
                                    "description": "Unique identifier for the tool call"
                                },
                                "name": {
                                    "type": "string",
                                    "description": "Name of tool/function"
                                },
                                "arguments": {
                                    "type": "object",
                                    "description": "Arguments used to call the tool",
                                    "additionalProperties": true
                                }
                            }
                        }
                    },
                    "tool_call_id": {
                        "oneOf": [
                            {
                                "type": "string",
                                "description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
                            },
                            {
                                "type": "array",
                                "description": "List of tool call IDs whose content payload this turn carries.",
                                "items": {
                                    "type": "string"
                                }
                            }
                        ]
                    }
                }
            }
        },
        "answer_attribution": {
            "type": "array",
            "description": "Information about how the answer was extracted from the model output",
            "items": {
                "type": "object",
                "required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
                "properties": {
                    "turn_idx": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Turn index in interactions. 0 for single_turn"
                    },
                    "source": {
                        "type": "string",
                        "description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
                    },
                    "extracted_value": {
                        "type": "string",
                        "description": "Value that was extracted"
                    },
                    "extraction_method": {
                        "type": "string",
                        "description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
                    },
                    "is_terminal": {
                        "type": "boolean",
                        "description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
                    }
                }
            }
        },
        "evaluation": {
            "type": "object",
            "description": "Evaluation results and scoring data",
            "required": ["score", "is_correct"],
            "properties": {
                "score": {
                    "type": ["number", "boolean"],
                    "description": "Instance-level score"
                },
                "is_correct": {
                    "type": "boolean",
                    "description": "Whether the final answer is correct"
                },
                "num_turns": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Number of turns in the interaction"
                },
                "tool_calls_count": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Count of tool calls across all turns in interactions"
                }
            }
        },
        "token_usage": {
            "type": ["object", "null"],
            "description": "Token usage for the model completion",
            "required": ["input_tokens", "output_tokens", "total_tokens"],
            "properties": {
                "input_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total input tokens used"
                },
                "output_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total output tokens used"
                },
                "total_tokens": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Total tokens used"
                },
                "input_tokens_cache_write": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens written to the cache"
                },
                "input_tokens_cache_read": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens retrieved from the cache"
                },
                "reasoning_tokens": {
                    "type": ["integer", "null"],
                    "minimum": 0,
                    "description": "Number of tokens used for reasoning"
                }
            }
        },
        "performance": {
            "type": ["object", "null"],
            "description": "Performance and latency metrics",
            "properties": {
                "latency_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Total latency in milliseconds"
                },
                "time_to_first_token_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Time to first token in milliseconds"
                },
                "generation_time_ms": {
                    "type": ["number", "null"],
                    "minimum": 0,
                    "description": "Time for generation in milliseconds"
                }
            },
            "additionalProperties": true
        },
        "error": {
            "type": ["string", "null"],
            "description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
        },
        "metadata": {
            "type": "object",
            "description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
            "additionalProperties": true
        }
    },
    "allOf": [
        {
            "description": "single_turn samples must carry an 'output' object and must not carry 'interactions' (absent or null).",
            "if": {
                "required": ["interaction_type"],
                "properties": {
                    "interaction_type": {
                        "const": "single_turn"
                    }
                }
            },
            "then": {
                "required": ["output"],
                "properties": {
                    "output": {
                        "type": "object"
                    },
                    "interactions": {
                        "type": "null"
                    }
                }
            }
        },
        {
            "description": "multi_turn and agentic samples must carry an 'interactions' array, must not carry 'output' (absent or null), and must report evaluation.num_turns.",
            "if": {
                "required": ["interaction_type"],
                "properties": {
                    "interaction_type": {
                        "enum": ["multi_turn", "agentic"]
                    }
                }
            },
            "then": {
                "required": ["interactions"],
                "properties": {
                    "output": {
                        "type": "null"
                    },
                    "interactions": {
                        "type": "array"
                    },
                    "evaluation": {
                        "required": ["num_turns"]
                    }
                }
            }
        }
    ]
}