qwen25-deposium-1024d / examples /test_results_advanced.json
tss-deposium's picture
Upload 8 files
6597245 verified
raw
history blame
2.02 kB
{
"summary": {
"cross_lingual_pass_rate": 0.0,
"difficult_cases_pass_rate": 0.75,
"edge_cases_pass_rate": 0.0
},
"cross_lingual": [
{
"test": "FR→EN instruction",
"success": false,
"score_diff": -0.06680182105237953
},
{
"test": "EN→FR instruction",
"success": false,
"score_diff": -0.21303130042796392
},
{
"test": "FR→Multilingual",
"success": false,
"score_diff": -0.3979676336355793
}
],
"difficult_cases": [
{
"test": "Negative instruction (Avoid)",
"success": true,
"score_diff": 0.0
},
{
"test": "Ambiguous: Train",
"success": false,
"score_diff": -0.013746112646522035
},
{
"test": "Multiple intentions",
"success": true,
"score_diff": 0.0
},
{
"test": "Formality matching",
"success": true,
"score_diff": 0.007767340301580772
}
],
"edge_cases": [
{
"test": "Spelling errors",
"success": false,
"score_diff": -0.023126566420730188
},
{
"test": "Very long query",
"success": false,
"score_diff": -0.06509758680256694
},
{
"test": "Contradictory instructions",
"success": false,
"score_diff": -0.02864061742806956
},
{
"test": "Non-Latin scripts",
"success": false,
"details": {
"Arabic": false,
"Russian": false,
"Chinese": false
}
}
],
"degradation": [
{
"test": "Simple EN instruction",
"score": 0.9339406309985464,
"margin": -0.009695165515504423
},
{
"test": "Cross-lingual FR→EN",
"score": 0.5904816604785096,
"margin": -0.0021998562159204482
},
{
"test": "Cross-lingual with typos",
"score": 0.5781216603117493,
"margin": 0.010975424877498807
},
{
"test": "Long cross-lingual query",
"score": 0.56935017490961,
"margin": 0.02394839991605835
}
]
}