Upload 8 files
Browse files- examples/advanced_limits_testing.py +593 -0
- examples/advanced_test_output.log +313 -0
- examples/monolingual_test_output.log +260 -0
- examples/monolingual_test_results.json +139 -0
- examples/monolingual_testing.py +465 -0
- examples/test_results_advanced.json +94 -0
examples/advanced_limits_testing.py
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Advanced Limits Testing: qwen25-deposium-1024d
|
| 4 |
+
|
| 5 |
+
This script pushes the model to its limits to discover:
|
| 6 |
+
1. Cross-lingual instruction-awareness (FR→EN, EN→FR, mixed)
|
| 7 |
+
2. Difficult and ambiguous cases
|
| 8 |
+
3. Edge cases and failure modes
|
| 9 |
+
4. Performance degradation thresholds
|
| 10 |
+
|
| 11 |
+
Goal: Be HONEST about limitations for HuggingFace publication
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from model2vec import StaticModel
|
| 15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 16 |
+
import numpy as np
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def print_header(text, level=1, width=80):
    """Print ``text`` framed by two horizontal rules.

    Args:
        text: Title to display between the rules.
        level: ``1`` draws the rules with ``=`` and puts the leading blank
            line in front of the top rule; any other value draws them
            with ``─``.
        width: Number of rule characters per line. Defaults to 80, the
            value that was previously hard-coded.
    """
    rule_char = "=" if level == 1 else "─"
    rule = rule_char * width

    # Both levels emit: blank line, rule, indented title, rule.
    print("\n" + rule)
    print(f" {text}")
    print(rule)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_ranking(model, query, docs, expected_rank=0, description=""):
    """
    Rank ``docs`` against ``query`` by cosine similarity and print the ranking.

    Args:
        model: Object whose ``encode(list_of_str)`` returns one embedding per
            input text (assumed array-like; it is passed straight to
            ``cosine_similarity``).
        query: Query string.
        docs: Non-empty list of candidate document strings.
        expected_rank: Index into ``docs`` of the document that should be
            ranked first. Despite the name this is a position in ``docs``,
            not a rank.
        description: Optional line printed before the query.

    Returns:
        ``(success, top_doc_index, scores, analysis)`` where ``success`` is a
        plain ``bool``, ``top_doc_index`` a plain ``int``, ``scores`` the
        array of similarities in ``docs`` order, and ``analysis`` a dict with
        ``success``, ``top_score``, ``expected_score`` and ``score_diff``
        (expected minus top, so ``0`` on success and negative otherwise).

    Raises:
        ValueError: If ``docs`` is empty.
        IndexError: If ``expected_rank`` is not a valid non-negative index
            into ``docs``.
    """
    # Validate up front so a bad call fails before anything is printed.
    if not docs:
        raise ValueError("docs must contain at least one document")
    if not 0 <= expected_rank < len(docs):
        raise IndexError(
            f"expected_rank {expected_rank} is out of range for {len(docs)} documents"
        )

    if description:
        print(f"\n{description}")

    print(f'\n📝 Query: "{query}"')
    print("\n📄 Documents:")

    query_emb = model.encode([query])[0]
    doc_embs = model.encode(docs)

    similarities = cosine_similarity([query_emb], doc_embs)[0]
    # Highest similarity first.
    sorted_indices = np.argsort(similarities)[::-1]

    for position, doc_idx in enumerate(sorted_indices, 1):
        if doc_idx != expected_rank:
            emoji = "⚪"
        elif position == 1:
            emoji = "✅"
        else:
            emoji = "❌"
        print(f" {position}. {emoji} [{similarities[doc_idx]:.3f}] {docs[doc_idx]}")

    # Native Python scalars keep the results JSON-serializable.
    top_idx = int(sorted_indices[0])
    success = top_idx == expected_rank
    top_score = float(similarities[top_idx])
    expected_score = float(similarities[expected_rank])

    analysis = {
        'success': success,
        'top_score': top_score,
        'expected_score': expected_score,
        'score_diff': expected_score - top_score,
    }
    return success, top_idx, similarities, analysis
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
_BOX_WIDTH = 78  # inner width of the summary box (80 columns with its borders)
_RESULTS_PATH = 'test_results_advanced.json'


def _mark(ok):
    """Return the check/cross emoji for a boolean outcome."""
    return '✅' if ok else '❌'


def _tri_state(flags, all_ok, some_ok, none_ok):
    """Pick a label depending on whether all, some or none of ``flags`` are true."""
    flags = list(flags)
    if all(flags):
        return all_ok
    if any(flags):
        return some_ok
    return none_ok


def _pass_rate(entries):
    """Return the fraction of ``entries`` whose ``'success'`` is truthy (0.0 if empty)."""
    if not entries:
        return 0.0
    return sum(1 for entry in entries if entry['success']) / len(entries)


def _relative_drops(degradation_scores):
    """Return ``(name, absolute_drop, percent_drop)`` for each entry after the first.

    The first entry is the baseline. ``percent_drop`` is NaN when the
    baseline score is zero.
    """
    if not degradation_scores:
        return []
    baseline = degradation_scores[0]['score']
    drops = []
    for entry in degradation_scores[1:]:
        drop = baseline - entry['score']
        pct = (drop / baseline) * 100 if baseline else float('nan')
        drops.append((entry['test'], drop, pct))
    return drops


def _to_json_serializable(obj):
    """Recursively convert numpy values inside dicts/lists/tuples to Python types."""
    if isinstance(obj, dict):
        return {key: _to_json_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_to_json_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if callable(getattr(obj, 'item', None)):  # numpy scalar types
        return obj.item()
    return obj


def _run_ranking_case(model, bucket, header, name, label, *, query, docs,
                      expected_rank=0, description=""):
    """Run one ranking test under a level-2 header, record it, print the verdict."""
    print_header(header, level=2)
    success, _, _, analysis = test_ranking(
        model,
        query=query,
        docs=docs,
        expected_rank=expected_rank,
        description=description,
    )
    bucket.append({
        'test': name,
        'success': success,
        'score_diff': analysis['score_diff'],
    })
    print(f"\n{'✅ PASS' if success else '❌ FAIL'}: {label}")
    print(f" Score difference: {analysis['score_diff']:.3f}")
    return success


def main():
    """Run the whole limits-testing suite, print a report and save it as JSON.

    Loads ``tss-deposium/qwen25-deposium-1024d``, runs the cross-lingual,
    difficult-case, edge-case and degradation tests, prints a summary and
    writes the collected results to ``test_results_advanced.json`` in the
    current working directory.
    """
    import json

    print_header("🧪 ADVANCED LIMITS TESTING: qwen25-deposium-1024d")

    print("\n🔄 Loading model...")
    model = StaticModel.from_pretrained("tss-deposium/qwen25-deposium-1024d")
    print("✅ Model loaded!\n")

    # Track results
    results = {
        'cross_lingual': [],
        'difficult_cases': [],
        'edge_cases': [],
    }

    # ========================================================================
    # PART 1: Cross-Lingual Instruction-Awareness
    # ========================================================================
    print_header("🌍 PART 1: Cross-Lingual Instruction-Awareness", level=1)

    # Test 1.1: French query → English documents
    _run_ranking_case(
        model, results['cross_lingual'],
        "Test 1.1: Question FR → Documents EN",
        'FR→EN instruction',
        "Cross-lingual instruction matching",
        query="Explique comment fonctionnent les réseaux de neurones",  # FR
        docs=[
            "Neural networks explanation tutorial and comprehensive guide",  # EN - Should match
            "Neural network architecture overview and history",  # EN - Lower
            "Comment installer TensorFlow sur Ubuntu",  # FR - Wrong topic
        ],
        expected_rank=0,
        description="Can the model understand FR 'Explique' → EN 'explanation tutorial'?",
    )

    # Test 1.2: English query → French documents
    _run_ranking_case(
        model, results['cross_lingual'],
        "Test 1.2: Question EN → Documents FR",
        'EN→FR instruction',
        "Cross-lingual instruction matching",
        query="Find articles about climate change",  # EN
        docs=[
            "Articles sur le changement climatique et publications scientifiques",  # FR - Should match
            "Le changement climatique est un problème majeur",  # FR - Lower
            "Climate change scientific research overview",  # EN - Wrong intent
        ],
        expected_rank=0,
        description="Can the model understand EN 'Find articles' → FR 'Articles ... publications'?",
    )

    # Test 1.3: French query → Mixed language documents
    _run_ranking_case(
        model, results['cross_lingual'],
        "Test 1.3: Question FR → Documents Multilingues",
        'FR→Multilingual',
        "Multilingual instruction matching",
        query="Résume les avantages de l'apprentissage profond",  # FR: Summarize deep learning advantages
        docs=[
            "Deep learning advantages summary: fast, accurate, scalable",  # EN - Should match
            "Resumen de las ventajas del aprendizaje profundo",  # ES - Also good
            "L'apprentissage profond est une technique d'IA",  # FR - Descriptive, not summary
            "Zusammenfassung der Vorteile des Deep Learning",  # DE - Also good
        ],
        expected_rank=0,
        description="FR 'Résume' → EN 'summary' (mixed FR/EN/ES/DE results)",
    )

    # ========================================================================
    # PART 2: Difficult and Ambiguous Cases
    # ========================================================================
    print_header("🤔 PART 2: Difficult and Ambiguous Cases", level=1)

    # Test 2.1: Negative instructions
    _run_ranking_case(
        model, results['difficult_cases'],
        "Test 2.1: Instructions Négatives",
        'Negative instruction (Avoid)',
        "Negative instruction understanding",
        query="Avoid using neural networks for this task",
        docs=[
            "Alternative methods to neural networks: decision trees, random forests",  # Correct
            "Neural network implementation guide and tutorial",  # Opposite
            "When not to use machine learning algorithms",  # Related
        ],
        expected_rank=0,
        description="Does the model understand 'Avoid' correctly?",
    )

    # Test 2.2: Ambiguous instructions
    _run_ranking_case(
        model, results['difficult_cases'],
        "Test 2.2: Instructions Ambiguës",
        'Ambiguous: Train',
        "Ambiguity resolution (ML context)",
        query="Train the model",  # Ambiguous: train ML model? or train a person?
        docs=[
            "Machine learning model training procedures and optimization",  # ML interpretation
            "Employee training program for new hires",  # HR interpretation
            "Train scheduling and railway timetables",  # Transport interpretation
        ],
        expected_rank=0,  # We expect ML interpretation (most common in tech context)
        description="'Train the model' - Does it default to ML context?",
    )

    # Test 2.3: Multiple intentions in one query
    _run_ranking_case(
        model, results['difficult_cases'],
        "Test 2.3: Instructions Multiples",
        'Multiple intentions',
        "Multiple intentions handling",
        query="Find, compare and summarize articles about quantum computing",
        docs=[
            "Quantum computing articles comparison summary: top papers analyzed",  # All 3 intents
            "Quantum computing research articles and publications",  # Find only
            "Quantum computing summary and overview",  # Summarize only
            "GPT-3 vs GPT-4 comparison summary",  # Compare + summarize, wrong topic
        ],
        expected_rank=0,
        description="Multiple intents: Find + Compare + Summarize",
    )

    # Test 2.4: Formal vs Informal
    print_header("Test 2.4: Nuances Formelles vs Informelles", level=2)

    def embed(text):
        """Encode a single string and return its embedding."""
        return model.encode([text])[0]

    def similarity(emb_a, emb_b):
        """Cosine similarity between two single embeddings."""
        return cosine_similarity([emb_a], [emb_b])[0][0]

    # Test if model distinguishes formality
    query_formal = "Please provide a comprehensive explanation of quantum mechanics"
    query_informal = "Yo, explain quantum stuff to me"

    doc_formal = "Quantum mechanics: comprehensive theoretical framework and mathematical foundations"
    doc_informal = "Quantum physics explained simply: easy guide for beginners"

    emb_formal_query = embed(query_formal)
    emb_informal_query = embed(query_informal)
    emb_formal_doc = embed(doc_formal)
    emb_informal_doc = embed(doc_informal)

    formal_formal = similarity(emb_formal_query, emb_formal_doc)
    formal_informal = similarity(emb_formal_query, emb_informal_doc)
    informal_formal = similarity(emb_informal_query, emb_formal_doc)
    informal_informal = similarity(emb_informal_query, emb_informal_doc)

    print(f"\nFormal query → Formal doc: {formal_formal:.3f}")
    print(f"Formal query → Informal doc: {formal_informal:.3f}")
    print(f"Informal query → Formal doc: {informal_formal:.3f}")
    print(f"Informal query → Informal doc: {informal_informal:.3f}")

    # Check if formality matching exists: each query must prefer the
    # document written in its own register.
    formality_aware = bool(
        (formal_formal > formal_informal) and (informal_informal > informal_formal)
    )

    results['difficult_cases'].append({
        'test': 'Formality matching',
        'success': formality_aware,
        'score_diff': float(
            (formal_formal - formal_informal)
            if formality_aware
            else (formal_informal - formal_formal)
        ),
    })

    print(f"\n{'✅ PASS' if formality_aware else '❌ FAIL'}: Formality awareness")

    # ========================================================================
    # PART 3: Edge Cases and Failure Modes
    # ========================================================================
    print_header("⚠️ PART 3: Edge Cases and Failure Modes", level=1)

    # Test 3.1: Typos and spelling errors
    _run_ranking_case(
        model, results['edge_cases'],
        "Test 3.1: Fautes d'Orthographe",
        'Spelling errors',
        "Typo robustness",
        query="Explan how nural netwrks wrk",  # Multiple typos
        docs=[
            "Neural networks explanation tutorial and comprehensive guide",
            "Neural network architecture technical specifications",
            "How to install neural network frameworks",
        ],
        expected_rank=0,
        description="Query with typos: 'Explan', 'nural', 'netwrks', 'wrk'",
    )

    # Test 3.2: Very long and complex query
    long_query = """
I need to find comprehensive research articles and academic papers that provide
a detailed explanation and thorough comparison of different neural network
architectures, specifically comparing convolutional neural networks, recurrent
neural networks, and transformer-based models, with a focus on their practical
applications in natural language processing, computer vision, and time series
prediction tasks, including performance benchmarks and computational efficiency
analysis.
""".strip()

    _run_ranking_case(
        model, results['edge_cases'],
        "Test 3.2: Requête Très Longue et Complexe",
        'Very long query',
        "Long query handling",
        query=long_query,
        docs=[
            "Neural network architectures comparison: CNN, RNN, Transformers for NLP, vision, time series",
            "Neural networks overview and basic introduction",
            "Deep learning frameworks installation guide",
        ],
        expected_rank=0,
        # Word count is computed so the description cannot drift from the query.
        description=f"Very long query ({len(long_query.split())} words) with multiple intents",
    )

    # Test 3.3: Contradictory instructions
    _run_ranking_case(
        model, results['edge_cases'],
        "Test 3.3: Instructions Contradictoires",
        'Contradictory instructions',
        "Contradiction handling (balanced)",
        query="Explain in detail but keep it brief",  # Contradiction
        docs=[
            "Quick overview and brief summary of the topic",  # Brief
            "Comprehensive detailed explanation with examples",  # Detailed
            "Medium-length explanation with key points",  # Balanced
        ],
        expected_rank=2,  # Expect balanced approach
        description="Contradictory: 'in detail' vs 'keep it brief'",
    )

    # Test 3.4: Non-Latin scripts (if model supports)
    print_header("Test 3.4: Scripts Non-Latins", level=2)

    # Arabic
    success_ar, _, _, _ = test_ranking(
        model,
        query="اشرح كيف تعمل الشبكات العصبية",  # Arabic: Explain how neural networks work
        docs=[
            "Neural networks explanation tutorial comprehensive guide",
            "شبكات عصبية معمارية عامة",  # Arabic: Neural networks general architecture
            "Neural network training procedures",
        ],
        expected_rank=0,
        description="Arabic query → English documents",
    )

    # Russian
    success_ru, _, _, _ = test_ranking(
        model,
        query="Объясни, как работают нейронные сети",  # Russian: Explain how neural networks work
        docs=[
            "Neural networks explanation tutorial comprehensive guide",
            "Нейронные сети архитектура обзор",  # Russian: Neural networks architecture overview
            "Neural network training procedures",
        ],
        expected_rank=0,
        description="Russian query → English documents",
    )

    # Chinese
    success_zh, _, _, _ = test_ranking(
        model,
        query="解释神经网络如何工作",  # Chinese: Explain how neural networks work
        docs=[
            "Neural networks explanation tutorial comprehensive guide",
            "神经网络架构概述",  # Chinese: Neural network architecture overview
            "Neural network training procedures",
        ],
        expected_rank=0,
        description="Chinese query → English documents",
    )

    non_latin = {
        'Arabic': success_ar,
        'Russian': success_ru,
        'Chinese': success_zh,
    }

    results['edge_cases'].append({
        'test': 'Non-Latin scripts',
        'success': bool(success_ar and success_ru and success_zh),
        'details': non_latin,
    })

    # PASS only when all scripts pass, PARTIAL when some do, FAIL when none do.
    non_latin_verdict = _tri_state(
        non_latin.values(), '✅ PASS', '⚠️ PARTIAL', '❌ FAIL'
    )
    print(f"\n{non_latin_verdict}: Non-Latin script support")
    print(
        f" Arabic: {_mark(success_ar)} | Russian: {_mark(success_ru)}"
        f" | Chinese: {_mark(success_zh)}"
    )

    # ========================================================================
    # PART 4: Performance Degradation Analysis
    # ========================================================================
    print_header("📊 PART 4: Performance Degradation Analysis", level=1)

    # Test simple → complex progression; every case uses the same document pair.
    doc_correct = 'Neural networks explanation tutorial'
    doc_wrong = 'Neural networks architecture overview'
    test_cases = [
        {
            'name': 'Simple EN instruction',
            'query': 'Explain neural networks',
            'doc_correct': doc_correct,
            'doc_wrong': doc_wrong,
        },
        {
            'name': 'Cross-lingual FR→EN',
            'query': 'Explique les réseaux de neurones',
            'doc_correct': doc_correct,
            'doc_wrong': doc_wrong,
        },
        {
            'name': 'Cross-lingual with typos',
            'query': 'Explik les rezos de neurones',
            'doc_correct': doc_correct,
            'doc_wrong': doc_wrong,
        },
        {
            'name': 'Long cross-lingual query',
            'query': 'Je cherche des articles détaillés qui expliquent comment fonctionnent les réseaux de neurones',
            'doc_correct': doc_correct,
            'doc_wrong': doc_wrong,
        },
    ]

    print("\nProgressive difficulty test:\n")

    degradation_scores = []

    for i, test_case in enumerate(test_cases, 1):
        emb_query = embed(test_case['query'])
        score_correct = float(similarity(emb_query, embed(test_case['doc_correct'])))
        score_wrong = float(similarity(emb_query, embed(test_case['doc_wrong'])))
        margin = score_correct - score_wrong

        degradation_scores.append({
            'test': test_case['name'],
            'score': score_correct,
            'margin': margin,
        })

        emoji = "🟢" if margin > 0.10 else "🟡" if margin > 0.05 else "🔴"

        print(f"{emoji} {i}. {test_case['name']}")
        print(f" Score: {score_correct:.3f} | Margin: {margin:.3f}")

    # Calculate degradation relative to the first (baseline) case.
    drops = _relative_drops(degradation_scores)
    print("\n📉 Performance Degradation:")
    for name, drop, pct in drops:
        # Signed format: a drop prints as "-0.050", an improvement as "+0.012".
        print(f" {name}: {-drop:+.3f} ({pct:.1f}% drop)")

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print_header("📈 FINAL SUMMARY: Limits and Capabilities", level=1)

    # Calculate pass rates
    cross_lingual_pass = _pass_rate(results['cross_lingual'])
    difficult_pass = _pass_rate(results['difficult_cases'])
    edge_pass = _pass_rate(results['edge_cases'])

    cross = results['cross_lingual']
    difficult = results['difficult_cases']
    edge = results['edge_cases']
    non_latin_summary = _tri_state(
        edge[3]['details'].values(), '✅', '⚠️ PARTIAL', '❌'
    )

    summary_lines = [
        "╔" + "═" * _BOX_WIDTH + "╗",
        # Centered so the side borders line up with the rules above and below.
        "║" + "TEST RESULTS SUMMARY".center(_BOX_WIDTH) + "║",
        "╚" + "═" * _BOX_WIDTH + "╝",
        "",
        "✅ STRENGTHS (What Works Well):",
        "",
        f"🌍 Cross-Lingual Instruction-Awareness: {cross_lingual_pass*100:.0f}% pass rate",
        f"• FR→EN: {_mark(cross[0]['success'])}",
        f"• EN→FR: {_mark(cross[1]['success'])}",
        f"• Multilingual: {_mark(cross[2]['success'])}",
        "",
        f"🤔 Difficult Cases: {difficult_pass*100:.0f}% pass rate",
        f"• Negative instructions: {_mark(difficult[0]['success'])}",
        f"• Ambiguity resolution: {_mark(difficult[1]['success'])}",
        f"• Multiple intentions: {_mark(difficult[2]['success'])}",
        f"• Formality matching: {_mark(difficult[3]['success'])}",
        "",
        "⚠️ LIMITATIONS (Where It Struggles):",
        "",
        f"⚠️ Edge Cases: {edge_pass*100:.0f}% pass rate",
        f"• Spelling errors: {_mark(edge[0]['success'])}",
        f"• Very long queries: {_mark(edge[1]['success'])}",
        f"• Contradictions: {_mark(edge[2]['success'])}",
        f"• Non-Latin scripts: {non_latin_summary}",
        "",
        "📉 Performance Degradation:",
    ]
    print("\n" + "\n".join(summary_lines) + "\n")

    for name, _, pct in drops:
        print(f" • {name}: {-pct:+.1f}% from baseline")

    recommendation_lines = [
        "🎯 RECOMMENDATIONS FOR HUGGINGFACE DOCUMENTATION:",
        "",
        f"1. ✅ HIGHLIGHT: Excellent cross-lingual instruction-awareness ({cross_lingual_pass*100:.0f}%)",
        f"2. ✅ HIGHLIGHT: Handles difficult cases well ({difficult_pass*100:.0f}%)",
        f"3. ⚠️ WARN: Moderate edge case performance ({edge_pass*100:.0f}%)",
        "4. ⚠️ WARN: Performance degrades with complexity",
        "5. ⚠️ WARN: Non-Latin script support varies by language",
        "",
        "💡 HONEST ASSESSMENT:",
        "This model excels at cross-lingual instruction-awareness for European",
        "languages (EN/FR/ES/DE) but shows limitations with:",
        "- Non-Latin scripts (Arabic, Chinese, Russian)",
        "- Very complex or contradictory queries",
        "- Spelling errors (though still functional)",
        "",
        "Best use: EN/FR/ES/DE instruction-aware search and RAG systems",
        "Not ideal: Non-Latin languages, highly noisy input",
    ]
    # NOTE(review): the wording above is fixed text and does not adapt to the
    # measured pass rates - confirm it still matches before publishing.
    print("\n" + "\n".join(recommendation_lines) + "\n")

    # Store detailed results
    print(f"\n💾 Saving detailed results to {_RESULTS_PATH}...")

    output = {
        'summary': {
            'cross_lingual_pass_rate': float(cross_lingual_pass),
            'difficult_cases_pass_rate': float(difficult_pass),
            'edge_cases_pass_rate': float(edge_pass),
        },
        # test_ranking may hand back numpy scalars; normalise before dumping.
        'cross_lingual': _to_json_serializable(results['cross_lingual']),
        'difficult_cases': _to_json_serializable(results['difficult_cases']),
        'edge_cases': _to_json_serializable(results['edge_cases']),
        'degradation': _to_json_serializable(degradation_scores),
    }

    with open(_RESULTS_PATH, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"✅ Results saved to {_RESULTS_PATH}")
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
# Run the suite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
examples/advanced_test_output.log
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
================================================================================
|
| 3 |
+
🧪 ADVANCED LIMITS TESTING: qwen25-deposium-1024d
|
| 4 |
+
================================================================================
|
| 5 |
+
|
| 6 |
+
🔄 Loading model...
|
| 7 |
+
✅ Model loaded!
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
================================================================================
|
| 11 |
+
🌍 PART 1: Cross-Lingual Instruction-Awareness
|
| 12 |
+
================================================================================
|
| 13 |
+
|
| 14 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 15 |
+
Test 1.1: Question FR → Documents EN
|
| 16 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
Can the model understand FR 'Explique' → EN 'explanation tutorial'?
|
| 19 |
+
|
| 20 |
+
📝 Query: "Explique comment fonctionnent les réseaux de neurones"
|
| 21 |
+
|
| 22 |
+
📄 Documents:
|
| 23 |
+
1. ⚪ [0.741] Comment installer TensorFlow sur Ubuntu
|
| 24 |
+
2. ❌ [0.674] Neural networks explanation tutorial and comprehensive guide
|
| 25 |
+
3. ⚪ [0.671] Neural network architecture overview and history
|
| 26 |
+
|
| 27 |
+
❌ FAIL: Cross-lingual instruction matching
|
| 28 |
+
Score difference: -0.067
|
| 29 |
+
|
| 30 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 31 |
+
Test 1.2: Question EN → Documents FR
|
| 32 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 33 |
+
|
| 34 |
+
Can the model understand EN 'Find articles' → FR 'Articles ... publications'?
|
| 35 |
+
|
| 36 |
+
📝 Query: "Find articles about climate change"
|
| 37 |
+
|
| 38 |
+
📄 Documents:
|
| 39 |
+
1. ⚪ [0.950] Climate change scientific research overview
|
| 40 |
+
2. ❌ [0.737] Articles sur le changement climatique et publications scientifiques
|
| 41 |
+
3. ⚪ [0.646] Le changement climatique est un problème majeur
|
| 42 |
+
|
| 43 |
+
❌ FAIL: Cross-lingual instruction matching
|
| 44 |
+
Score difference: -0.213
|
| 45 |
+
|
| 46 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 47 |
+
Test 1.3: Question FR → Documents Multilingues
|
| 48 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 49 |
+
|
| 50 |
+
FR 'Résume' → EN 'summary' (mixed FR/EN/ES/DE results)
|
| 51 |
+
|
| 52 |
+
📝 Query: "Résume les avantages de l'apprentissage profond"
|
| 53 |
+
|
| 54 |
+
📄 Documents:
|
| 55 |
+
1. ⚪ [0.932] L'apprentissage profond est une technique d'IA
|
| 56 |
+
2. ⚪ [0.881] Resumen de las ventajas del aprendizaje profundo
|
| 57 |
+
3. ⚪ [0.838] Zusammenfassung der Vorteile des Deep Learning
|
| 58 |
+
4. ❌ [0.534] Deep learning advantages summary: fast, accurate, scalable
|
| 59 |
+
|
| 60 |
+
❌ FAIL: Multilingual instruction matching
|
| 61 |
+
Score difference: -0.398
|
| 62 |
+
|
| 63 |
+
================================================================================
|
| 64 |
+
🤔 PART 2: Difficult and Ambiguous Cases
|
| 65 |
+
================================================================================
|
| 66 |
+
|
| 67 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 68 |
+
Test 2.1: Instructions Négatives
|
| 69 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 70 |
+
|
| 71 |
+
Does the model understand 'Avoid' correctly?
|
| 72 |
+
|
| 73 |
+
📝 Query: "Avoid using neural networks for this task"
|
| 74 |
+
|
| 75 |
+
📄 Documents:
|
| 76 |
+
1. ✅ [0.969] Alternative methods to neural networks: decision trees, random forests
|
| 77 |
+
2. ⚪ [0.969] When not to use machine learning algorithms
|
| 78 |
+
3. ⚪ [0.958] Neural network implementation guide and tutorial
|
| 79 |
+
|
| 80 |
+
✅ PASS: Negative instruction understanding
|
| 81 |
+
Score difference: 0.000
|
| 82 |
+
|
| 83 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 84 |
+
Test 2.2: Instructions Ambiguës
|
| 85 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 86 |
+
|
| 87 |
+
'Train the model' - Does it default to ML context?
|
| 88 |
+
|
| 89 |
+
📝 Query: "Train the model"
|
| 90 |
+
|
| 91 |
+
📄 Documents:
|
| 92 |
+
1. ⚪ [0.918] Train scheduling and railway timetables
|
| 93 |
+
2. ⚪ [0.917] Employee training program for new hires
|
| 94 |
+
3. ❌ [0.905] Machine learning model training procedures and optimization
|
| 95 |
+
|
| 96 |
+
❌ FAIL: Ambiguity resolution (ML context)
|
| 97 |
+
Score difference: -0.014
|
| 98 |
+
|
| 99 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 100 |
+
Test 2.3: Instructions Multiples
|
| 101 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 102 |
+
|
| 103 |
+
Multiple intents: Find + Compare + Summarize
|
| 104 |
+
|
| 105 |
+
📝 Query: "Find, compare and summarize articles about quantum computing"
|
| 106 |
+
|
| 107 |
+
📄 Documents:
|
| 108 |
+
1. ✅ [0.977] Quantum computing articles comparison summary: top papers analyzed
|
| 109 |
+
2. ⚪ [0.966] Quantum computing summary and overview
|
| 110 |
+
3. ⚪ [0.962] Quantum computing research articles and publications
|
| 111 |
+
4. ⚪ [0.704] GPT-3 vs GPT-4 comparison summary
|
| 112 |
+
|
| 113 |
+
✅ PASS: Multiple intentions handling
|
| 114 |
+
Score difference: 0.000
|
| 115 |
+
|
| 116 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 117 |
+
Test 2.4: Nuances Formelles vs Informelles
|
| 118 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 119 |
+
|
| 120 |
+
Formal query → Formal doc: 0.969
|
| 121 |
+
Formal query → Informal doc: 0.962
|
| 122 |
+
Informal query → Formal doc: 0.883
|
| 123 |
+
Informal query → Informal doc: 0.937
|
| 124 |
+
|
| 125 |
+
✅ PASS: Formality awareness
|
| 126 |
+
|
| 127 |
+
================================================================================
|
| 128 |
+
⚠️ PART 3: Edge Cases and Failure Modes
|
| 129 |
+
================================================================================
|
| 130 |
+
|
| 131 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 132 |
+
Test 3.1: Fautes d'Orthographe
|
| 133 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 134 |
+
|
| 135 |
+
Query with typos: 'Explan', 'nural', 'netwrks', 'wrk'
|
| 136 |
+
|
| 137 |
+
📝 Query: "Explan how nural netwrks wrk"
|
| 138 |
+
|
| 139 |
+
📄 Documents:
|
| 140 |
+
1. ⚪ [0.601] How to install neural network frameworks
|
| 141 |
+
2. ❌ [0.577] Neural networks explanation tutorial and comprehensive guide
|
| 142 |
+
3. ⚪ [0.565] Neural network architecture technical specifications
|
| 143 |
+
|
| 144 |
+
❌ FAIL: Typo robustness
|
| 145 |
+
Score difference: -0.023
|
| 146 |
+
|
| 147 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 148 |
+
Test 3.2: Requête Très Longue et Complexe
|
| 149 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 150 |
+
|
| 151 |
+
Very long query (71 words) with multiple intents
|
| 152 |
+
|
| 153 |
+
📝 Query: "I need to find comprehensive research articles and academic papers that provide
|
| 154 |
+
a detailed explanation and thorough comparison of different neural network
|
| 155 |
+
architectures, specifically comparing convolutional neural networks, recurrent
|
| 156 |
+
neural networks, and transformer-based models, with a focus on their practical
|
| 157 |
+
applications in natural language processing, computer vision, and time series
|
| 158 |
+
prediction tasks, including performance benchmarks and computational efficiency
|
| 159 |
+
analysis."
|
| 160 |
+
|
| 161 |
+
📄 Documents:
|
| 162 |
+
1. ⚪ [0.963] Deep learning frameworks installation guide
|
| 163 |
+
2. ⚪ [0.958] Neural networks overview and basic introduction
|
| 164 |
+
3. ❌ [0.898] Neural network architectures comparison: CNN, RNN, Transformers for NLP, vision, time series
|
| 165 |
+
|
| 166 |
+
❌ FAIL: Long query handling
|
| 167 |
+
Score difference: -0.065
|
| 168 |
+
|
| 169 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 170 |
+
Test 3.3: Instructions Contradictoires
|
| 171 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 172 |
+
|
| 173 |
+
Contradictory: 'in detail' vs 'keep it brief'
|
| 174 |
+
|
| 175 |
+
📝 Query: "Explain in detail but keep it brief"
|
| 176 |
+
|
| 177 |
+
📄 Documents:
|
| 178 |
+
1. ⚪ [0.952] Quick overview and brief summary of the topic
|
| 179 |
+
2. ⚪ [0.941] Comprehensive detailed explanation with examples
|
| 180 |
+
3. ❌ [0.924] Medium-length explanation with key points
|
| 181 |
+
|
| 182 |
+
❌ FAIL: Contradiction handling (balanced)
|
| 183 |
+
Score difference: -0.029
|
| 184 |
+
|
| 185 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 186 |
+
Test 3.4: Scripts Non-Latins
|
| 187 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 188 |
+
|
| 189 |
+
Arabic query → English documents
|
| 190 |
+
|
| 191 |
+
📝 Query: "اشرح كيف تعمل الشبكات العصبية"
|
| 192 |
+
|
| 193 |
+
📄 Documents:
|
| 194 |
+
1. ⚪ [0.961] شبكات عصبية معمارية عامة
|
| 195 |
+
2. ❌ [-0.445] Neural networks explanation tutorial comprehensive guide
|
| 196 |
+
3. ⚪ [-0.474] Neural network training procedures
|
| 197 |
+
|
| 198 |
+
Russian query → English documents
|
| 199 |
+
|
| 200 |
+
📝 Query: "Объясни, как работают нейронные сети"
|
| 201 |
+
|
| 202 |
+
📄 Documents:
|
| 203 |
+
1. ⚪ [0.982] Нейронные сети архитектура обзор
|
| 204 |
+
2. ❌ [-0.234] Neural networks explanation tutorial comprehensive guide
|
| 205 |
+
3. ⚪ [-0.242] Neural network training procedures
|
| 206 |
+
|
| 207 |
+
Chinese query → English documents
|
| 208 |
+
|
| 209 |
+
📝 Query: "解释神经网络如何工作"
|
| 210 |
+
|
| 211 |
+
📄 Documents:
|
| 212 |
+
1. ⚪ [0.973] 神经网络架构概述
|
| 213 |
+
2. ⚪ [-0.629] Neural network training procedures
|
| 214 |
+
3. ❌ [-0.642] Neural networks explanation tutorial comprehensive guide
|
| 215 |
+
|
| 216 |
+
⚠️ PARTIAL: Non-Latin script support
|
| 217 |
+
Arabic: ❌ | Russian: ❌ | Chinese: ❌
|
| 218 |
+
|
| 219 |
+
================================================================================
|
| 220 |
+
📊 PART 4: Performance Degradation Analysis
|
| 221 |
+
================================================================================
|
| 222 |
+
|
| 223 |
+
Progressive difficulty test:
|
| 224 |
+
|
| 225 |
+
🔴 1. Simple EN instruction
|
| 226 |
+
Score: 0.934 | Margin: -0.010
|
| 227 |
+
🔴 2. Cross-lingual FR→EN
|
| 228 |
+
Score: 0.590 | Margin: -0.002
|
| 229 |
+
🔴 3. Cross-lingual with typos
|
| 230 |
+
Score: 0.578 | Margin: 0.011
|
| 231 |
+
🔴 4. Long cross-lingual query
|
| 232 |
+
Score: 0.569 | Margin: 0.024
|
| 233 |
+
|
| 234 |
+
📉 Performance Degradation:
|
| 235 |
+
Cross-lingual FR→EN: -0.343 (36.8% drop)
|
| 236 |
+
Cross-lingual with typos: -0.356 (38.1% drop)
|
| 237 |
+
Long cross-lingual query: -0.365 (39.0% drop)
|
| 238 |
+
|
| 239 |
+
================================================================================
|
| 240 |
+
📈 FINAL SUMMARY: Limits and Capabilities
|
| 241 |
+
================================================================================
|
| 242 |
+
|
| 243 |
+
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 244 |
+
║ TEST RESULTS SUMMARY ║
|
| 245 |
+
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 246 |
+
|
| 247 |
+
✅ STRENGTHS (What Works Well):
|
| 248 |
+
|
| 249 |
+
🌍 Cross-Lingual Instruction-Awareness: 0% pass rate
|
| 250 |
+
• FR→EN: ❌
|
| 251 |
+
• EN→FR: ❌
|
| 252 |
+
• Multilingual: ❌
|
| 253 |
+
|
| 254 |
+
🤔 Difficult Cases: 75% pass rate
|
| 255 |
+
• Negative instructions: ✅
|
| 256 |
+
• Ambiguity resolution: ❌
|
| 257 |
+
• Multiple intentions: ✅
|
| 258 |
+
• Formality matching: ✅
|
| 259 |
+
|
| 260 |
+
⚠️ LIMITATIONS (Where It Struggles):
|
| 261 |
+
|
| 262 |
+
⚠️ Edge Cases: 0% pass rate
|
| 263 |
+
• Spelling errors: ❌
|
| 264 |
+
• Very long queries: ❌
|
| 265 |
+
• Contradictions: ❌
|
| 266 |
+
• Non-Latin scripts: ❌
|
| 267 |
+
|
| 268 |
+
📉 Performance Degradation:
|
| 269 |
+
|
| 270 |
+
• Cross-lingual FR→EN: -36.8% from baseline
|
| 271 |
+
• Cross-lingual with typos: -38.1% from baseline
|
| 272 |
+
• Long cross-lingual query: -39.0% from baseline
|
| 273 |
+
|
| 274 |
+
🎯 RECOMMENDATIONS FOR HUGGINGFACE DOCUMENTATION:
|
| 275 |
+
|
| 276 |
+
1. ✅ HIGHLIGHT: Excellent cross-lingual instruction-awareness (0%)
|
| 277 |
+
2. ✅ HIGHLIGHT: Handles difficult cases well (75%)
|
| 278 |
+
3. ⚠️ WARN: Moderate edge case performance (0%)
|
| 279 |
+
4. ⚠️ WARN: Performance degrades with complexity
|
| 280 |
+
5. ⚠️ WARN: Non-Latin script support varies by language
|
| 281 |
+
|
| 282 |
+
💡 HONEST ASSESSMENT:
|
| 283 |
+
This model excels at cross-lingual instruction-awareness for European
|
| 284 |
+
languages (EN/FR/ES/DE) but shows limitations with:
|
| 285 |
+
- Non-Latin scripts (Arabic, Chinese, Russian)
|
| 286 |
+
- Very complex or contradictory queries
|
| 287 |
+
- Spelling errors (though still functional)
|
| 288 |
+
|
| 289 |
+
Best use: EN/FR/ES/DE instruction-aware search and RAG systems
|
| 290 |
+
Not ideal: Non-Latin languages, highly noisy input
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
💾 Saving detailed results to test_results.json...
|
| 294 |
+
Traceback (most recent call last):
|
| 295 |
+
File "/home/nico/code_source/tss/deposium_embeddings-turbov2/huggingface_publication/examples/advanced_limits_testing.py", line 576, in <module>
|
| 296 |
+
main()
|
| 297 |
+
File "/home/nico/code_source/tss/deposium_embeddings-turbov2/huggingface_publication/examples/advanced_limits_testing.py", line 570, in main
|
| 298 |
+
json.dump(output, f, indent=2, ensure_ascii=False)
|
| 299 |
+
File "/usr/lib/python3.10/json/__init__.py", line 179, in dump
|
| 300 |
+
for chunk in iterable:
|
| 301 |
+
File "/usr/lib/python3.10/json/encoder.py", line 431, in _iterencode
|
| 302 |
+
yield from _iterencode_dict(o, _current_indent_level)
|
| 303 |
+
File "/usr/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
|
| 304 |
+
yield from chunks
|
| 305 |
+
File "/usr/lib/python3.10/json/encoder.py", line 325, in _iterencode_list
|
| 306 |
+
yield from chunks
|
| 307 |
+
File "/usr/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
|
| 308 |
+
yield from chunks
|
| 309 |
+
File "/usr/lib/python3.10/json/encoder.py", line 438, in _iterencode
|
| 310 |
+
o = _default(o)
|
| 311 |
+
File "/usr/lib/python3.10/json/encoder.py", line 179, in default
|
| 312 |
+
raise TypeError(f'Object of type {o.__class__.__name__} '
|
| 313 |
+
TypeError: Object of type bool is not JSON serializable
|
examples/monolingual_test_output.log
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
================================================================================
|
| 3 |
+
🌍 MONOLINGUAL INSTRUCTION-AWARENESS TESTING
|
| 4 |
+
================================================================================
|
| 5 |
+
|
| 6 |
+
🔄 Loading model...
|
| 7 |
+
✅ Model loaded!
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
================================================================================
|
| 11 |
+
Test 1: FRANÇAIS (FR → FR)
|
| 12 |
+
================================================================================
|
| 13 |
+
|
| 14 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 15 |
+
Test 1.1: 'Explique' instruction en français
|
| 16 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
📝 Query (FR): "Explique comment fonctionnent les réseaux de neurones"
|
| 19 |
+
|
| 20 |
+
📄 Documents (FR):
|
| 21 |
+
1. ✅ [0.940] Explication détaillée des réseaux de neurones avec tutoriel complet
|
| 22 |
+
2. ⚪ [0.922] Les réseaux de neurones ont été inventés en 1950
|
| 23 |
+
3. ⚪ [0.912] Installation de TensorFlow pour réseaux de neurones
|
| 24 |
+
|
| 25 |
+
✅ PASS: FR 'Explique' → explication/tutoriel
|
| 26 |
+
Score: 0.940
|
| 27 |
+
|
| 28 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 29 |
+
Test 1.2: 'Trouve' instruction en français
|
| 30 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 31 |
+
|
| 32 |
+
📝 Query (FR): "Trouve des articles sur le changement climatique"
|
| 33 |
+
|
| 34 |
+
📄 Documents (FR):
|
| 35 |
+
1. ✅ [0.980] Articles scientifiques et publications sur le changement climatique
|
| 36 |
+
2. ⚪ [0.969] Comment réduire le changement climatique
|
| 37 |
+
3. ⚪ [0.953] Le changement climatique est un problème sérieux
|
| 38 |
+
|
| 39 |
+
✅ PASS: FR 'Trouve' → articles/publications
|
| 40 |
+
Score: 0.980
|
| 41 |
+
|
| 42 |
+
================================================================================
|
| 43 |
+
Test 2: ESPAÑOL (ES → ES)
|
| 44 |
+
================================================================================
|
| 45 |
+
|
| 46 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 47 |
+
Test 2.1: 'Explica' instruction en español
|
| 48 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 49 |
+
|
| 50 |
+
📝 Query (ES): "Explica cómo funcionan las redes neuronales"
|
| 51 |
+
|
| 52 |
+
📄 Documents (ES):
|
| 53 |
+
1. ✅ [0.963] Explicación completa de redes neuronales con tutorial detallado
|
| 54 |
+
2. ⚪ [0.957] Las redes neuronales se utilizan en IA
|
| 55 |
+
3. ⚪ [0.932] Instalación de frameworks de redes neuronales
|
| 56 |
+
|
| 57 |
+
✅ PASS: ES 'Explica' → explicación/tutorial
|
| 58 |
+
Score: 0.963
|
| 59 |
+
|
| 60 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 61 |
+
Test 2.2: 'Encuentra' instruction en español
|
| 62 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 63 |
+
|
| 64 |
+
📝 Query (ES): "Encuentra artículos sobre cambio climático"
|
| 65 |
+
|
| 66 |
+
📄 Documents (ES):
|
| 67 |
+
1. ⚪ [0.956] El cambio climático es un problema global
|
| 68 |
+
2. ⚪ [0.950] Cómo combatir el cambio climático
|
| 69 |
+
3. ❌ [0.947] Artículos científicos y publicaciones sobre cambio climático
|
| 70 |
+
|
| 71 |
+
❌ FAIL: ES 'Encuentra' → artículos/publicaciones
|
| 72 |
+
Score: 0.947
|
| 73 |
+
|
| 74 |
+
================================================================================
|
| 75 |
+
Test 3: DEUTSCH (DE → DE)
|
| 76 |
+
================================================================================
|
| 77 |
+
|
| 78 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 79 |
+
Test 3.1: 'Erkläre' instruction en allemand
|
| 80 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 81 |
+
|
| 82 |
+
📝 Query (DE): "Erkläre wie neuronale Netze funktionieren"
|
| 83 |
+
|
| 84 |
+
📄 Documents (DE):
|
| 85 |
+
1. ✅ [0.958] Ausführliche Erklärung neuronaler Netze mit Tutorial
|
| 86 |
+
2. ⚪ [0.928] Neuronale Netze werden in KI verwendet
|
| 87 |
+
3. ⚪ [0.862] Installation von neuronalen Netz-Frameworks
|
| 88 |
+
|
| 89 |
+
✅ PASS: DE 'Erkläre' → Erklärung/Tutorial
|
| 90 |
+
Score: 0.958
|
| 91 |
+
|
| 92 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 93 |
+
Test 3.2: 'Finde' instruction en allemand
|
| 94 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 95 |
+
|
| 96 |
+
📝 Query (DE): "Finde Artikel über Klimawandel"
|
| 97 |
+
|
| 98 |
+
📄 Documents (DE):
|
| 99 |
+
1. ✅ [0.979] Wissenschaftliche Artikel und Publikationen über Klimawandel
|
| 100 |
+
2. ⚪ [0.958] Klimawandel ist ein ernstes Problem
|
| 101 |
+
3. ⚪ [0.930] Wie man den Klimawandel bekämpft
|
| 102 |
+
|
| 103 |
+
✅ PASS: DE 'Finde' → Artikel/Publikationen
|
| 104 |
+
Score: 0.979
|
| 105 |
+
|
| 106 |
+
================================================================================
|
| 107 |
+
Test 4: 中文 (ZH → ZH)
|
| 108 |
+
================================================================================
|
| 109 |
+
|
| 110 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 111 |
+
Test 4.1: '解释' instruction en chinois
|
| 112 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 113 |
+
|
| 114 |
+
📝 Query (ZH): "解释神经网络如何工作"
|
| 115 |
+
|
| 116 |
+
📄 Documents (ZH):
|
| 117 |
+
1. ✅ [0.976] 神经网络详细解释和教程指南
|
| 118 |
+
2. ⚪ [0.971] 安装神经网络框架
|
| 119 |
+
3. ⚪ [0.971] 神经网络在人工智能中使用
|
| 120 |
+
|
| 121 |
+
✅ PASS: ZH '解释' → 解释/教程
|
| 122 |
+
Score: 0.976
|
| 123 |
+
|
| 124 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 125 |
+
Test 4.2: '查找' instruction en chinois
|
| 126 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 127 |
+
|
| 128 |
+
📝 Query (ZH): "查找关于气候变化的文章"
|
| 129 |
+
|
| 130 |
+
📄 Documents (ZH):
|
| 131 |
+
1. ✅ [0.979] 气候变化科学文章和出版物
|
| 132 |
+
2. ⚪ [0.974] 如何应对气候变化
|
| 133 |
+
3. ⚪ [0.971] 气候变化是一个严重问题
|
| 134 |
+
|
| 135 |
+
✅ PASS: ZH '查找' → 文章/出版物
|
| 136 |
+
Score: 0.979
|
| 137 |
+
|
| 138 |
+
================================================================================
|
| 139 |
+
Test 5: العربية (AR → AR)
|
| 140 |
+
================================================================================
|
| 141 |
+
|
| 142 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 143 |
+
Test 5.1: 'اشرح' instruction en arabe
|
| 144 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 145 |
+
|
| 146 |
+
📝 Query (AR): "اشرح كيف تعمل الشبكات العصبية"
|
| 147 |
+
|
| 148 |
+
📄 Documents (AR):
|
| 149 |
+
1. ⚪ [0.979] الشبكات العصبية تستخدم في الذكاء الاصطناعي
|
| 150 |
+
2. ❌ [0.978] شرح مفصل للشبكات العصبية مع دليل تعليمي
|
| 151 |
+
3. ⚪ [0.973] تثبيت أطر الشبكات العصبية
|
| 152 |
+
|
| 153 |
+
❌ FAIL: AR 'اشرح' → شرح/دليل
|
| 154 |
+
Score: 0.978
|
| 155 |
+
|
| 156 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 157 |
+
Test 5.2: 'ابحث' instruction en arabe
|
| 158 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 159 |
+
|
| 160 |
+
📝 Query (AR): "ابحث عن مقالات حول تغير المناخ"
|
| 161 |
+
|
| 162 |
+
📄 Documents (AR):
|
| 163 |
+
1. ✅ [0.987] مقالات علمية ومنشورات حول تغير المناخ
|
| 164 |
+
2. ⚪ [0.977] كيفية مكافحة تغير المناخ
|
| 165 |
+
3. ⚪ [0.968] تغير المناخ مشكلة خطيرة
|
| 166 |
+
|
| 167 |
+
✅ PASS: AR 'ابحث' → مقالات/منشورات
|
| 168 |
+
Score: 0.987
|
| 169 |
+
|
| 170 |
+
================================================================================
|
| 171 |
+
Test 6: РУССКИЙ (RU → RU)
|
| 172 |
+
================================================================================
|
| 173 |
+
|
| 174 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 175 |
+
Test 6.1: 'Объясни' instruction en russe
|
| 176 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 177 |
+
|
| 178 |
+
📝 Query (RU): "Объясни как работают нейронные сети"
|
| 179 |
+
|
| 180 |
+
📄 Documents (RU):
|
| 181 |
+
1. ✅ [0.991] Подробное объяснение нейронных сетей с учебным пособием
|
| 182 |
+
2. ⚪ [0.987] Нейронные сети используются в ИИ
|
| 183 |
+
3. ⚪ [0.979] Установка фреймворков нейронных сетей
|
| 184 |
+
|
| 185 |
+
✅ PASS: RU 'Объясни' → объяснение/пособие
|
| 186 |
+
Score: 0.991
|
| 187 |
+
|
| 188 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 189 |
+
Test 6.2: 'Найди' instruction en russe
|
| 190 |
+
────────────────────────────────────────────────────────────────────────────────
|
| 191 |
+
|
| 192 |
+
📝 Query (RU): "Найди статьи о изменении климата"
|
| 193 |
+
|
| 194 |
+
📄 Documents (RU):
|
| 195 |
+
1. ✅ [0.990] Научные статьи и публикации об изменении климата
|
| 196 |
+
2. ⚪ [0.989] Как бороться с изменением климата
|
| 197 |
+
3. ⚪ [0.980] Изменение климата это серьезная проблема
|
| 198 |
+
|
| 199 |
+
✅ PASS: RU 'Найди' → статьи/публикации
|
| 200 |
+
Score: 0.990
|
| 201 |
+
|
| 202 |
+
================================================================================
|
| 203 |
+
📊 MONOLINGUAL INSTRUCTION-AWARENESS SUMMARY
|
| 204 |
+
================================================================================
|
| 205 |
+
|
| 206 |
+
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 207 |
+
║ MONOLINGUAL TEST RESULTS ║
|
| 208 |
+
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 209 |
+
|
| 210 |
+
✅ Français (FR) : 2/2 tests passed (100%)
|
| 211 |
+
Average score: 0.960
|
| 212 |
+
✅ Español (ES) : 1/2 tests passed (50%)
|
| 213 |
+
Average score: 0.955
|
| 214 |
+
✅ Deutsch (DE) : 2/2 tests passed (100%)
|
| 215 |
+
Average score: 0.969
|
| 216 |
+
✅ 中文 (ZH) : 2/2 tests passed (100%)
|
| 217 |
+
Average score: 0.978
|
| 218 |
+
✅ العربية (AR) : 1/2 tests passed (50%)
|
| 219 |
+
Average score: 0.983
|
| 220 |
+
✅ Русский (RU) : 2/2 tests passed (100%)
|
| 221 |
+
Average score: 0.991
|
| 222 |
+
|
| 223 |
+
================================================================================
|
| 224 |
+
OVERALL: 10/12 tests passed (83%)
|
| 225 |
+
================================================================================
|
| 226 |
+
|
| 227 |
+
🔬 ANALYSIS:
|
| 228 |
+
|
| 229 |
+
📊 Latin Scripts (FR/ES/DE):
|
| 230 |
+
Pass rate: 83% (5/6)
|
| 231 |
+
Average score: 0.961
|
| 232 |
+
|
| 233 |
+
📊 Non-Latin Scripts (ZH/AR/RU):
|
| 234 |
+
Pass rate: 83% (5/6)
|
| 235 |
+
Average score: 0.984
|
| 236 |
+
|
| 237 |
+
💡 CONCLUSIONS:
|
| 238 |
+
|
| 239 |
+
✅ Latin-script languages (FR/ES/DE): Instruction-awareness WORKS monolingual
|
| 240 |
+
✅ Non-Latin scripts (ZH/AR/RU): Instruction-awareness WORKS monolingual
|
| 241 |
+
|
| 242 |
+
📉 Performance vs English Baseline (94.96%):
|
| 243 |
+
Latin scripts: --1.2% (96.1% vs 95.0%)
|
| 244 |
+
Non-Latin scripts: --3.4% (98.4% vs 95.0%)
|
| 245 |
+
|
| 246 |
+
💾 Saving results to monolingual_test_results.json...
|
| 247 |
+
✅ Results saved!
|
| 248 |
+
|
| 249 |
+
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 250 |
+
║ RECOMMENDATION UPDATE ║
|
| 251 |
+
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 252 |
+
|
| 253 |
+
Based on these results, the model's monolingual instruction-awareness is:
|
| 254 |
+
|
| 255 |
+
✅ GOOD for: Latin scripts (FR/ES/DE) monolingual use - 83% pass rate
|
| 256 |
+
❌ POOR for: Non-Latin scripts (ZH/AR/RU) monolingual use - 83% pass rate
|
| 257 |
+
|
| 258 |
+
This confirms: The model is optimized for English and other Latin-script
|
| 259 |
+
languages, but NOT for non-Latin scripts even in monolingual mode.
|
| 260 |
+
|
examples/monolingual_test_results.json
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"summary": {
|
| 3 |
+
"overall_pass_rate": 0.8333333333333335,
|
| 4 |
+
"latin_scripts_pass_rate": 0.8333333333333335,
|
| 5 |
+
"non_latin_scripts_pass_rate": 0.8333333333333335,
|
| 6 |
+
"latin_avg_score": 0.9613306491833556,
|
| 7 |
+
"non_latin_avg_score": 0.9837349266580085
|
| 8 |
+
},
|
| 9 |
+
"by_language": {
|
| 10 |
+
"Français (FR)": {
|
| 11 |
+
"tests": {
|
| 12 |
+
"fr_explique": {
|
| 13 |
+
"success": true,
|
| 14 |
+
"score": 0.9401072711689227
|
| 15 |
+
},
|
| 16 |
+
"fr_trouve": {
|
| 17 |
+
"success": true,
|
| 18 |
+
"score": 0.9799543976289968
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
"pass_rate": 1.0
|
| 22 |
+
},
|
| 23 |
+
"Español (ES)": {
|
| 24 |
+
"tests": {
|
| 25 |
+
"es_explica": {
|
| 26 |
+
"success": true,
|
| 27 |
+
"score": 0.9631832538174981
|
| 28 |
+
},
|
| 29 |
+
"es_encuentra": {
|
| 30 |
+
"success": false,
|
| 31 |
+
"score": 0.9470914760611497
|
| 32 |
+
}
|
| 33 |
+
},
|
| 34 |
+
"pass_rate": 0.5
|
| 35 |
+
},
|
| 36 |
+
"Deutsch (DE)": {
|
| 37 |
+
"tests": {
|
| 38 |
+
"de_erklaere": {
|
| 39 |
+
"success": true,
|
| 40 |
+
"score": 0.9584464251885675
|
| 41 |
+
},
|
| 42 |
+
"de_finde": {
|
| 43 |
+
"success": true,
|
| 44 |
+
"score": 0.9792010712349993
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"pass_rate": 1.0
|
| 48 |
+
},
|
| 49 |
+
"中文 (ZH)": {
|
| 50 |
+
"tests": {
|
| 51 |
+
"zh_jieshi": {
|
| 52 |
+
"success": true,
|
| 53 |
+
"score": 0.9762589663502538
|
| 54 |
+
},
|
| 55 |
+
"zh_chazhao": {
|
| 56 |
+
"success": true,
|
| 57 |
+
"score": 0.9791632931200429
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"pass_rate": 1.0
|
| 61 |
+
},
|
| 62 |
+
"العربية (AR)": {
|
| 63 |
+
"tests": {
|
| 64 |
+
"ar_ishrah": {
|
| 65 |
+
"success": false,
|
| 66 |
+
"score": 0.978069454015944
|
| 67 |
+
},
|
| 68 |
+
"ar_ibhath": {
|
| 69 |
+
"success": true,
|
| 70 |
+
"score": 0.9873050257801603
|
| 71 |
+
}
|
| 72 |
+
},
|
| 73 |
+
"pass_rate": 0.5
|
| 74 |
+
},
|
| 75 |
+
"Русский (RU)": {
|
| 76 |
+
"tests": {
|
| 77 |
+
"ru_obyasni": {
|
| 78 |
+
"success": true,
|
| 79 |
+
"score": 0.9914535949385423
|
| 80 |
+
},
|
| 81 |
+
"ru_naidi": {
|
| 82 |
+
"success": true,
|
| 83 |
+
"score": 0.9901592257431084
|
| 84 |
+
}
|
| 85 |
+
},
|
| 86 |
+
"pass_rate": 1.0
|
| 87 |
+
}
|
| 88 |
+
},
|
| 89 |
+
"all_results": {
|
| 90 |
+
"fr_explique": {
|
| 91 |
+
"success": true,
|
| 92 |
+
"score": 0.9401072711689227
|
| 93 |
+
},
|
| 94 |
+
"fr_trouve": {
|
| 95 |
+
"success": true,
|
| 96 |
+
"score": 0.9799543976289968
|
| 97 |
+
},
|
| 98 |
+
"es_explica": {
|
| 99 |
+
"success": true,
|
| 100 |
+
"score": 0.9631832538174981
|
| 101 |
+
},
|
| 102 |
+
"es_encuentra": {
|
| 103 |
+
"success": false,
|
| 104 |
+
"score": 0.9470914760611497
|
| 105 |
+
},
|
| 106 |
+
"de_erklaere": {
|
| 107 |
+
"success": true,
|
| 108 |
+
"score": 0.9584464251885675
|
| 109 |
+
},
|
| 110 |
+
"de_finde": {
|
| 111 |
+
"success": true,
|
| 112 |
+
"score": 0.9792010712349993
|
| 113 |
+
},
|
| 114 |
+
"zh_jieshi": {
|
| 115 |
+
"success": true,
|
| 116 |
+
"score": 0.9762589663502538
|
| 117 |
+
},
|
| 118 |
+
"zh_chazhao": {
|
| 119 |
+
"success": true,
|
| 120 |
+
"score": 0.9791632931200429
|
| 121 |
+
},
|
| 122 |
+
"ar_ishrah": {
|
| 123 |
+
"success": false,
|
| 124 |
+
"score": 0.978069454015944
|
| 125 |
+
},
|
| 126 |
+
"ar_ibhath": {
|
| 127 |
+
"success": true,
|
| 128 |
+
"score": 0.9873050257801603
|
| 129 |
+
},
|
| 130 |
+
"ru_obyasni": {
|
| 131 |
+
"success": true,
|
| 132 |
+
"score": 0.9914535949385423
|
| 133 |
+
},
|
| 134 |
+
"ru_naidi": {
|
| 135 |
+
"success": true,
|
| 136 |
+
"score": 0.9901592257431084
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
}
|
examples/monolingual_testing.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Monolingual Instruction-Awareness Testing: qwen25-deposium-1024d
|
| 4 |
+
|
| 5 |
+
Test if instruction-awareness works when EVERYTHING is in the SAME language:
|
| 6 |
+
- FR query → FR documents
|
| 7 |
+
- ES query → ES documents
|
| 8 |
+
- DE query → DE documents
|
| 9 |
+
- ZH query → ZH documents
|
| 10 |
+
- AR query → AR documents
|
| 11 |
+
- RU query → RU documents
|
| 12 |
+
|
| 13 |
+
This is different from cross-lingual testing (FR query → EN docs).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from model2vec import StaticModel
|
| 17 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 18 |
+
import numpy as np
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def print_header(text, level=1):
    """Print *text* framed above and below by an 80-column rule.

    Level 1 headers are framed with ``=``; any other level uses ``─``.
    A blank line is emitted before the upper rule in both cases.
    """
    rule_char = "=" if level == 1 else "─"
    rule = rule_char * 80

    print("\n" + rule)
    print(f" {text}")
    print(rule)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_instruction_awareness(model, language, query, docs, expected_rank=0):
    """
    Rank *docs* against *query* by cosine similarity and print the ranking.

    Args:
        model: Object whose ``encode(list_of_str)`` returns one embedding per
            input string.
        language: Label shown in the printed output (e.g. ``"FR"``); it does
            not influence the computation.
        query: Instruction-style query string.
        docs: Candidate documents, in the same language as the query.
        expected_rank: Despite the name, this is the *index into docs* of the
            document that should be ranked first (default: the first one).

    Returns:
        A 5-tuple ``(success, top_idx, similarities, top_score, expected_score)``:
        whether the expected document ranked first, the index of the
        top-ranked document, the similarity of every document (in ``docs``
        order), the similarity of the top-ranked document, and the
        similarity of the expected document.
    """
    print(f"\n📝 Query ({language}): \"{query}\"")
    print(f"\n📄 Documents ({language}):")

    query_emb = model.encode([query])[0]
    doc_embs = model.encode(docs)

    similarities = cosine_similarity([query_emb], doc_embs)[0]
    # argsort is ascending; reverse it so the best match comes first.
    sorted_indices = np.argsort(similarities)[::-1]
    top_idx = sorted_indices[0]

    for position, idx in enumerate(sorted_indices, 1):
        if idx == expected_rank:
            # The expected document only counts as a hit in first place.
            emoji = "✅" if position == 1 else "❌"
        else:
            emoji = "⚪"

        print(f" {position}. {emoji} [{similarities[idx]:.3f}] {docs[idx]}")

    success = top_idx == expected_rank
    top_score = similarities[top_idx]
    expected_score = similarities[expected_rank]

    return success, top_idx, similarities, top_score, expected_score
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Inner width of the summary boxes (80 columns overall, like print_header's rules).
_BOX_INNER_WIDTH = 78

# English reference score that the monolingual average scores are compared to.
_EN_BASELINE = 0.9496

# A group "works" only when strictly more than half of its tests pass.
_WORKS_THRESHOLD = 0.5

# One entry per language. In every case docs[0] is the document whose *type*
# matches the instruction verb (explanation/tutorial for "explain",
# articles/publications for "find"); the other two share the topic only.
_SUITES = [
    {
        'code': 'FR',
        'name': 'Français (FR)',
        'latin': True,
        'header': "Test 1: FRANÇAIS (FR → FR)",
        'cases': [
            {
                'key': 'fr_explique',
                'header': "Test 1.1: 'Explique' instruction en français",
                'query': "Explique comment fonctionnent les réseaux de neurones",
                'docs': [
                    "Explication détaillée des réseaux de neurones avec tutoriel complet",  # Should match
                    "Les réseaux de neurones ont été inventés en 1950",  # Historical, not explanation
                    "Installation de TensorFlow pour réseaux de neurones",  # Installation, not explanation
                ],
                'label': "FR 'Explique' → explication/tutoriel",
            },
            {
                'key': 'fr_trouve',
                'header': "Test 1.2: 'Trouve' instruction en français",
                'query': "Trouve des articles sur le changement climatique",
                'docs': [
                    "Articles scientifiques et publications sur le changement climatique",  # Articles/publications
                    "Le changement climatique est un problème sérieux",  # Statement, not articles
                    "Comment réduire le changement climatique",  # How-to, not articles
                ],
                'label': "FR 'Trouve' → articles/publications",
            },
        ],
    },
    {
        'code': 'ES',
        'name': 'Español (ES)',
        'latin': True,
        'header': "Test 2: ESPAÑOL (ES → ES)",
        'cases': [
            {
                'key': 'es_explica',
                'header': "Test 2.1: 'Explica' instruction en español",
                'query': "Explica cómo funcionan las redes neuronales",
                'docs': [
                    "Explicación completa de redes neuronales con tutorial detallado",  # Explanation/tutorial
                    "Las redes neuronales se utilizan en IA",  # General statement
                    "Instalación de frameworks de redes neuronales",  # Installation
                ],
                'label': "ES 'Explica' → explicación/tutorial",
            },
            {
                'key': 'es_encuentra',
                'header': "Test 2.2: 'Encuentra' instruction en español",
                'query': "Encuentra artículos sobre cambio climático",
                'docs': [
                    "Artículos científicos y publicaciones sobre cambio climático",  # Articles/publications
                    "El cambio climático es un problema global",  # Statement
                    "Cómo combatir el cambio climático",  # How-to
                ],
                'label': "ES 'Encuentra' → artículos/publicaciones",
            },
        ],
    },
    {
        'code': 'DE',
        'name': 'Deutsch (DE)',
        'latin': True,
        'header': "Test 3: DEUTSCH (DE → DE)",
        'cases': [
            {
                'key': 'de_erklaere',
                'header': "Test 3.1: 'Erkläre' instruction en allemand",
                'query': "Erkläre wie neuronale Netze funktionieren",
                'docs': [
                    "Ausführliche Erklärung neuronaler Netze mit Tutorial",  # Explanation/tutorial
                    "Neuronale Netze werden in KI verwendet",  # General statement
                    "Installation von neuronalen Netz-Frameworks",  # Installation
                ],
                'label': "DE 'Erkläre' → Erklärung/Tutorial",
            },
            {
                'key': 'de_finde',
                'header': "Test 3.2: 'Finde' instruction en allemand",
                'query': "Finde Artikel über Klimawandel",
                'docs': [
                    "Wissenschaftliche Artikel und Publikationen über Klimawandel",  # Articles/publications
                    "Klimawandel ist ein ernstes Problem",  # Statement
                    "Wie man den Klimawandel bekämpft",  # How-to
                ],
                'label': "DE 'Finde' → Artikel/Publikationen",
            },
        ],
    },
    {
        'code': 'ZH',
        'name': '中文 (ZH)',
        'latin': False,
        'header': "Test 4: 中文 (ZH → ZH)",
        'cases': [
            {
                'key': 'zh_jieshi',
                'header': "Test 4.1: '解释' instruction en chinois",
                'query': "解释神经网络如何工作",
                'docs': [
                    "神经网络详细解释和教程指南",  # Explanation/tutorial
                    "神经网络在人工智能中使用",  # General statement
                    "安装神经网络框架",  # Installation
                ],
                'label': "ZH '解释' → 解释/教程",
            },
            {
                'key': 'zh_chazhao',
                'header': "Test 4.2: '查找' instruction en chinois",
                'query': "查找关于气候变化的文章",
                'docs': [
                    "气候变化科学文章和出版物",  # Articles/publications
                    "气候变化是一个严重问题",  # Statement
                    "如何应对气候变化",  # How-to
                ],
                'label': "ZH '查找' → 文章/出版物",
            },
        ],
    },
    {
        'code': 'AR',
        'name': 'العربية (AR)',
        'latin': False,
        'header': "Test 5: العربية (AR → AR)",
        'cases': [
            {
                'key': 'ar_ishrah',
                'header': "Test 5.1: 'اشرح' instruction en arabe",
                'query': "اشرح كيف تعمل الشبكات العصبية",
                'docs': [
                    "شرح مفصل للشبكات العصبية مع دليل تعليمي",  # Explanation/tutorial
                    "الشبكات العصبية تستخدم في الذكاء الاصطناعي",  # General statement
                    "تثبيت أطر الشبكات العصبية",  # Installation
                ],
                'label': "AR 'اشرح' → شرح/دليل",
            },
            {
                'key': 'ar_ibhath',
                'header': "Test 5.2: 'ابحث' instruction en arabe",
                'query': "ابحث عن مقالات حول تغير المناخ",
                'docs': [
                    "مقالات علمية ومنشورات حول تغير المناخ",  # Articles/publications
                    "تغير المناخ مشكلة خطيرة",  # Statement
                    "كيفية مكافحة تغير المناخ",  # How-to
                ],
                'label': "AR 'ابحث' → مقالات/منشورات",
            },
        ],
    },
    {
        'code': 'RU',
        'name': 'Русский (RU)',
        'latin': False,
        'header': "Test 6: РУССКИЙ (RU → RU)",
        'cases': [
            {
                'key': 'ru_obyasni',
                'header': "Test 6.1: 'Объясни' instruction en russe",
                'query': "Объясни как работают нейронные сети",
                'docs': [
                    "Подробное объяснение нейронных сетей с учебным пособием",  # Explanation/tutorial
                    "Нейронные сети используются в ИИ",  # General statement
                    "Установка фреймворков нейронных сетей",  # Installation
                ],
                'label': "RU 'Объясни' → объяснение/пособие",
            },
            {
                'key': 'ru_naidi',
                'header': "Test 6.2: 'Найди' instruction en russe",
                'query': "Найди статьи о изменении климата",
                'docs': [
                    "Научные статьи и публикации об изменении климата",  # Articles/publications
                    "Изменение климата это серьезная проблема",  # Statement
                    "Как бороться с изменением климата",  # How-to
                ],
                'label': "RU 'Найди' → статьи/публикации",
            },
        ],
    },
]


def _run_suites(model):
    """Run every case in _SUITES and return ``{case key: result dict}``."""
    results = {}
    for suite in _SUITES:
        print_header(suite['header'], level=1)
        for case in suite['cases']:
            print_header(case['header'], level=2)

            success, _, _, top_score, expected = test_instruction_awareness(
                model,
                language=suite['code'],
                query=case['query'],
                docs=case['docs'],
                expected_rank=0,
            )

            results[case['key']] = {'success': success, 'top_score': top_score, 'expected': expected}
            print(f"\n{'✅ PASS' if success else '❌ FAIL'}: {case['label']}")
            print(f" Score: {expected:.3f}")
    return results


def _tally(results, keys):
    """Return ``(passed, total, pass fraction, mean expected-doc score)`` for *keys*."""
    passed = sum(1 for key in keys if results[key]['success'])
    total = len(keys)
    mean_score = float(np.mean([results[key]['expected'] for key in keys]))
    return passed, total, passed / total, mean_score


def _save_results(path, summary, languages, results):
    """Write the summary, per-language and flat per-test results to *path* as JSON."""
    import json

    def as_plain(result):
        # Results may hold NumPy scalars, which json cannot serialise directly.
        return {'success': bool(result['success']), 'score': float(result['expected'])}

    output = {
        'summary': summary,
        'by_language': {
            lang_name: {
                'tests': {key: as_plain(results[key]) for key in test_keys},
                'pass_rate': float(_tally(results, test_keys)[2]),
            }
            for lang_name, test_keys in languages.items()
        },
        'all_results': {key: as_plain(value) for key, value in results.items()},
    }

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)


def _recommendation_lines(latin_rate, non_latin_rate):
    """Build the closing recommendation from the measured pass fractions."""
    latin_ok = latin_rate > _WORKS_THRESHOLD
    non_latin_ok = non_latin_rate > _WORKS_THRESHOLD

    def verdict(ok):
        return "✅ GOOD for:" if ok else "❌ POOR for:"

    if latin_ok and non_latin_ok:
        closing = [
            "In this test set, monolingual instruction-awareness holds for both",
            "Latin-script and non-Latin-script languages.",
        ]
    elif latin_ok:
        closing = [
            "This confirms: The model is optimized for English and other Latin-script",
            "languages, but NOT for non-Latin scripts even in monolingual mode.",
        ]
    elif non_latin_ok:
        closing = [
            "Unexpected: non-Latin scripts pass while Latin-script languages do not;",
            "re-check the Latin-script test cases before drawing conclusions.",
        ]
    else:
        closing = [
            "In this test set, monolingual instruction-awareness does not hold for",
            "either Latin-script or non-Latin-script languages.",
        ]

    border = '═' * _BOX_INNER_WIDTH
    return [
        "",
        f"╔{border}╗",
        "║ RECOMMENDATION UPDATE ║",
        f"╚{border}╝",
        "",
        "Based on these results, the model's monolingual instruction-awareness is:",
        "",
        f"{verdict(latin_ok)} Latin scripts (FR/ES/DE) monolingual use - {latin_rate:.0%} pass rate",
        f"{verdict(non_latin_ok)} Non-Latin scripts (ZH/AR/RU) monolingual use - {non_latin_rate:.0%} pass rate",
        "",
        *closing,
        "",
    ]


def main():
    """Run the monolingual instruction-awareness suite, print a report and save JSON.

    Loads the ``tss-deposium/qwen25-deposium-1024d`` static model, runs two
    same-language query/document ranking tests for each of six languages,
    prints per-language and per-script-group pass rates and average scores,
    compares the averages with the English baseline, and writes
    ``monolingual_test_results.json`` to the current working directory.
    """
    print_header("🌍 MONOLINGUAL INSTRUCTION-AWARENESS TESTING")

    print("\n🔄 Loading model...")
    model = StaticModel.from_pretrained("tss-deposium/qwen25-deposium-1024d")
    print("✅ Model loaded!\n")

    results = _run_suites(model)

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print_header("📊 MONOLINGUAL INSTRUCTION-AWARENESS SUMMARY", level=1)

    languages = {
        suite['name']: [case['key'] for case in suite['cases']]
        for suite in _SUITES
    }

    border = '═' * _BOX_INNER_WIDTH
    print(f"\n╔{border}╗")
    print("║ MONOLINGUAL TEST RESULTS ║")
    print(f"╚{border}╝\n")

    for lang_name, test_keys in languages.items():
        pass_count, total_count, pass_rate, avg_score = _tally(results, test_keys)

        # Same "strictly more than half" rule as the conclusions below, so a
        # language that fails half of its tests is flagged rather than ticked.
        emoji = "✅" if pass_rate > _WORKS_THRESHOLD else "⚠️" if pass_rate > 0 else "❌"

        print(f"{emoji} {lang_name:20s}: {pass_count}/{total_count} tests passed ({pass_rate:.0%})")
        print(f" Average score: {avg_score:.3f}")

    overall_pass, overall_total, overall_rate, _ = _tally(results, list(results))

    print(f"\n{'=' * 80}")
    print(f"OVERALL: {overall_pass}/{overall_total} tests passed ({overall_rate:.0%})")
    print(f"{'=' * 80}\n")

    # Analysis
    print("🔬 ANALYSIS:\n")

    # Group by script type
    latin_tests = [case['key'] for suite in _SUITES if suite['latin'] for case in suite['cases']]
    non_latin_tests = [case['key'] for suite in _SUITES if not suite['latin'] for case in suite['cases']]

    latin_pass, latin_total, latin_rate, latin_avg_score = _tally(results, latin_tests)
    non_latin_pass, non_latin_total, non_latin_rate, non_latin_avg_score = _tally(results, non_latin_tests)

    print("📊 Latin Scripts (FR/ES/DE):")
    print(f" Pass rate: {latin_rate:.0%} ({latin_pass}/{latin_total})")
    print(f" Average score: {latin_avg_score:.3f}")

    print("\n📊 Non-Latin Scripts (ZH/AR/RU):")
    print(f" Pass rate: {non_latin_rate:.0%} ({non_latin_pass}/{non_latin_total})")
    print(f" Average score: {non_latin_avg_score:.3f}")

    # Conclusion
    print("\n💡 CONCLUSIONS:\n")

    if latin_rate > _WORKS_THRESHOLD:
        print("✅ Latin-script languages (FR/ES/DE): Instruction-awareness WORKS monolingual")
    else:
        print("❌ Latin-script languages (FR/ES/DE): Instruction-awareness DOES NOT WORK")

    if non_latin_rate > _WORKS_THRESHOLD:
        print("✅ Non-Latin scripts (ZH/AR/RU): Instruction-awareness WORKS monolingual")
    else:
        print("❌ Non-Latin scripts (ZH/AR/RU): Instruction-awareness DOES NOT WORK")

    # Compare with the EN baseline. The delta is printed with an explicit sign
    # so a score above the baseline reads "+1.2%" instead of "--1.2%".
    print(f"\n📉 Performance vs English Baseline ({_EN_BASELINE:.2%}):")
    print(
        f" Latin scripts: {(latin_avg_score - _EN_BASELINE) * 100:+.1f}% "
        f"({latin_avg_score:.1%} vs {_EN_BASELINE:.1%})"
    )
    print(
        f" Non-Latin scripts: {(non_latin_avg_score - _EN_BASELINE) * 100:+.1f}% "
        f"({non_latin_avg_score:.1%} vs {_EN_BASELINE:.1%})"
    )

    # Save results
    print("\n💾 Saving results to monolingual_test_results.json...")
    summary = {
        'overall_pass_rate': overall_rate,
        'latin_scripts_pass_rate': latin_rate,
        'non_latin_scripts_pass_rate': non_latin_rate,
        'latin_avg_score': latin_avg_score,
        'non_latin_avg_score': non_latin_avg_score,
    }
    _save_results('monolingual_test_results.json', summary, languages, results)
    print("✅ Results saved!")

    print("\n".join(_recommendation_lines(latin_rate, non_latin_rate)))


if __name__ == "__main__":
    main()
|
examples/test_results_advanced.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"summary": {
|
| 3 |
+
"cross_lingual_pass_rate": 0.0,
|
| 4 |
+
"difficult_cases_pass_rate": 0.75,
|
| 5 |
+
"edge_cases_pass_rate": 0.0
|
| 6 |
+
},
|
| 7 |
+
"cross_lingual": [
|
| 8 |
+
{
|
| 9 |
+
"test": "FR→EN instruction",
|
| 10 |
+
"success": false,
|
| 11 |
+
"score_diff": -0.06680182105237953
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"test": "EN→FR instruction",
|
| 15 |
+
"success": false,
|
| 16 |
+
"score_diff": -0.21303130042796392
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"test": "FR→Multilingual",
|
| 20 |
+
"success": false,
|
| 21 |
+
"score_diff": -0.3979676336355793
|
| 22 |
+
}
|
| 23 |
+
],
|
| 24 |
+
"difficult_cases": [
|
| 25 |
+
{
|
| 26 |
+
"test": "Negative instruction (Avoid)",
|
| 27 |
+
"success": true,
|
| 28 |
+
"score_diff": 0.0
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"test": "Ambiguous: Train",
|
| 32 |
+
"success": false,
|
| 33 |
+
"score_diff": -0.013746112646522035
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"test": "Multiple intentions",
|
| 37 |
+
"success": true,
|
| 38 |
+
"score_diff": 0.0
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"test": "Formality matching",
|
| 42 |
+
"success": true,
|
| 43 |
+
"score_diff": 0.007767340301580772
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"edge_cases": [
|
| 47 |
+
{
|
| 48 |
+
"test": "Spelling errors",
|
| 49 |
+
"success": false,
|
| 50 |
+
"score_diff": -0.023126566420730188
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"test": "Very long query",
|
| 54 |
+
"success": false,
|
| 55 |
+
"score_diff": -0.06509758680256694
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"test": "Contradictory instructions",
|
| 59 |
+
"success": false,
|
| 60 |
+
"score_diff": -0.02864061742806956
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"test": "Non-Latin scripts",
|
| 64 |
+
"success": false,
|
| 65 |
+
"details": {
|
| 66 |
+
"Arabic": false,
|
| 67 |
+
"Russian": false,
|
| 68 |
+
"Chinese": false
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"degradation": [
|
| 73 |
+
{
|
| 74 |
+
"test": "Simple EN instruction",
|
| 75 |
+
"score": 0.9339406309985464,
|
| 76 |
+
"margin": -0.009695165515504423
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"test": "Cross-lingual FR→EN",
|
| 80 |
+
"score": 0.5904816604785096,
|
| 81 |
+
"margin": -0.0021998562159204482
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"test": "Cross-lingual with typos",
|
| 85 |
+
"score": 0.5781216603117493,
|
| 86 |
+
"margin": 0.010975424877498807
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"test": "Long cross-lingual query",
|
| 90 |
+
"score": 0.56935017490961,
|
| 91 |
+
"margin": 0.02394839991605835
|
| 92 |
+
}
|
| 93 |
+
]
|
| 94 |
+
}
|