| { | |
| "summary": { | |
| "cross_lingual_pass_rate": 0.0, | |
| "difficult_cases_pass_rate": 0.75, | |
| "edge_cases_pass_rate": 0.0 | |
| }, | |
| "cross_lingual": [ | |
| { | |
| "test": "FR→EN instruction", | |
| "success": false, | |
| "score_diff": -0.06680182105237953 | |
| }, | |
| { | |
| "test": "EN→FR instruction", | |
| "success": false, | |
| "score_diff": -0.21303130042796392 | |
| }, | |
| { | |
| "test": "FR→Multilingual", | |
| "success": false, | |
| "score_diff": -0.3979676336355793 | |
| } | |
| ], | |
| "difficult_cases": [ | |
| { | |
| "test": "Negative instruction (Avoid)", | |
| "success": true, | |
| "score_diff": 0.0 | |
| }, | |
| { | |
| "test": "Ambiguous: Train", | |
| "success": false, | |
| "score_diff": -0.013746112646522035 | |
| }, | |
| { | |
| "test": "Multiple intentions", | |
| "success": true, | |
| "score_diff": 0.0 | |
| }, | |
| { | |
| "test": "Formality matching", | |
| "success": true, | |
| "score_diff": 0.007767340301580772 | |
| } | |
| ], | |
| "edge_cases": [ | |
| { | |
| "test": "Spelling errors", | |
| "success": false, | |
| "score_diff": -0.023126566420730188 | |
| }, | |
| { | |
| "test": "Very long query", | |
| "success": false, | |
| "score_diff": -0.06509758680256694 | |
| }, | |
| { | |
| "test": "Contradictory instructions", | |
| "success": false, | |
| "score_diff": -0.02864061742806956 | |
| }, | |
| { | |
| "test": "Non-Latin scripts", | |
| "success": false, | |
| "details": { | |
| "Arabic": false, | |
| "Russian": false, | |
| "Chinese": false | |
| } | |
| } | |
| ], | |
| "degradation": [ | |
| { | |
| "test": "Simple EN instruction", | |
| "score": 0.9339406309985464, | |
| "margin": -0.009695165515504423 | |
| }, | |
| { | |
| "test": "Cross-lingual FR→EN", | |
| "score": 0.5904816604785096, | |
| "margin": -0.0021998562159204482 | |
| }, | |
| { | |
| "test": "Cross-lingual with typos", | |
| "score": 0.5781216603117493, | |
| "margin": 0.010975424877498807 | |
| }, | |
| { | |
| "test": "Long cross-lingual query", | |
| "score": 0.56935017490961, | |
| "margin": 0.02394839991605835 | |
| } | |
| ] | |
| } |