{ "summary": { "cross_lingual_pass_rate": 0.0, "difficult_cases_pass_rate": 0.75, "edge_cases_pass_rate": 0.0 }, "cross_lingual": [ { "test": "FR→EN instruction", "success": false, "score_diff": -0.06680182105237953 }, { "test": "EN→FR instruction", "success": false, "score_diff": -0.21303130042796392 }, { "test": "FR→Multilingual", "success": false, "score_diff": -0.3979676336355793 } ], "difficult_cases": [ { "test": "Negative instruction (Avoid)", "success": true, "score_diff": 0.0 }, { "test": "Ambiguous: Train", "success": false, "score_diff": -0.013746112646522035 }, { "test": "Multiple intentions", "success": true, "score_diff": 0.0 }, { "test": "Formality matching", "success": true, "score_diff": 0.007767340301580772 } ], "edge_cases": [ { "test": "Spelling errors", "success": false, "score_diff": -0.023126566420730188 }, { "test": "Very long query", "success": false, "score_diff": -0.06509758680256694 }, { "test": "Contradictory instructions", "success": false, "score_diff": -0.02864061742806956 }, { "test": "Non-Latin scripts", "success": false, "details": { "Arabic": false, "Russian": false, "Chinese": false } } ], "degradation": [ { "test": "Simple EN instruction", "score": 0.9339406309985464, "margin": -0.009695165515504423 }, { "test": "Cross-lingual FR→EN", "score": 0.5904816604785096, "margin": -0.0021998562159204482 }, { "test": "Cross-lingual with typos", "score": 0.5781216603117493, "margin": 0.010975424877498807 }, { "test": "Long cross-lingual query", "score": 0.56935017490961, "margin": 0.02394839991605835 } ] }