File size: 2,019 Bytes
6597245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
{
  "summary": {
    "cross_lingual_pass_rate": 0.0,
    "difficult_cases_pass_rate": 0.75,
    "edge_cases_pass_rate": 0.0
  },
  "cross_lingual": [
    {
      "test": "FR→EN instruction",
      "success": false,
      "score_diff": -0.06680182105237953
    },
    {
      "test": "EN→FR instruction",
      "success": false,
      "score_diff": -0.21303130042796392
    },
    {
      "test": "FR→Multilingual",
      "success": false,
      "score_diff": -0.3979676336355793
    }
  ],
  "difficult_cases": [
    {
      "test": "Negative instruction (Avoid)",
      "success": true,
      "score_diff": 0.0
    },
    {
      "test": "Ambiguous: Train",
      "success": false,
      "score_diff": -0.013746112646522035
    },
    {
      "test": "Multiple intentions",
      "success": true,
      "score_diff": 0.0
    },
    {
      "test": "Formality matching",
      "success": true,
      "score_diff": 0.007767340301580772
    }
  ],
  "edge_cases": [
    {
      "test": "Spelling errors",
      "success": false,
      "score_diff": -0.023126566420730188
    },
    {
      "test": "Very long query",
      "success": false,
      "score_diff": -0.06509758680256694
    },
    {
      "test": "Contradictory instructions",
      "success": false,
      "score_diff": -0.02864061742806956
    },
    {
      "test": "Non-Latin scripts",
      "success": false,
      "details": {
        "Arabic": false,
        "Russian": false,
        "Chinese": false
      }
    }
  ],
  "degradation": [
    {
      "test": "Simple EN instruction",
      "score": 0.9339406309985464,
      "margin": -0.009695165515504423
    },
    {
      "test": "Cross-lingual FR→EN",
      "score": 0.5904816604785096,
      "margin": -0.0021998562159204482
    },
    {
      "test": "Cross-lingual with typos",
      "score": 0.5781216603117493,
      "margin": 0.010975424877498807
    },
    {
      "test": "Long cross-lingual query",
      "score": 0.56935017490961,
      "margin": 0.02394839991605835
    }
  ]
}