# turkish-medical-model-api/scripts/verify_l4_deployment.py
# Conquerorr0
# feat: L4 optimization deployment - 2025-07-25 10:08
# dc4fe0a
import requests
import json
import time
# Root URL of the deployed API; every request in this script is built from it.
BASE_URL = "https://conquerorr000-turkish-medical-model-api.hf.space"
def _fetch_json(path, timeout=30):
    """GET ``BASE_URL + path`` and return the decoded body as a dict.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx status (raised by ``raise_for_status``).
        ValueError: if the body is not valid JSON or is not a JSON object.
    """
    response = requests.get(f"{BASE_URL}{path}", timeout=timeout)
    # Without this, an error body such as {"detail": "Not Found"} would be
    # parsed like a normal payload and reported as "Unknown" values.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, dict):
        raise ValueError(
            f"expected a JSON object from {path}, got {type(payload).__name__}"
        )
    return payload


def _check_health() -> bool:
    """Return True when ``/health`` reports a truthy ``model_loaded``."""
    try:
        health_data = _fetch_json("/health")
    except (requests.RequestException, ValueError) as e:
        print(f" Health Check Failed: {e}")
        return False

    if health_data.get("model_loaded"):
        print(" Model Status: Loaded and Ready")
        return True
    print(" Model Status: Not Ready")
    return False


def _check_endpoints() -> bool:
    """Probe the new endpoints; return True only if each answers HTTP 200."""
    # Maps endpoint -> JSON body to POST, or None for a plain GET.
    probes = {
        "/memory-status": None,
        "/conversation": {
            "messages": [{"role": "user", "content": "Test"}],
            "max_tokens": 50,
        },
    }
    all_ok = True
    for endpoint, payload in probes.items():
        url = f"{BASE_URL}{endpoint}"
        try:
            if payload is None:
                response = requests.get(url, timeout=30)
            else:
                response = requests.post(url, json=payload, timeout=30)
        except requests.RequestException as e:
            print(f" Endpoint {endpoint}: Error - {e}")
            all_ok = False
            continue

        if response.status_code == 200:
            print(f" Endpoint {endpoint}: Working")
        else:
            print(f" Endpoint {endpoint}: Failed ({response.status_code})")
            all_ok = False
    return all_ok


def _check_optimizations() -> bool:
    """Print the ``optimization_info`` section of ``/debug``.

    Returns False only when the debug data cannot be retrieved or parsed.
    Optimizations that look inactive are reported but are advisory.
    """
    try:
        debug_data = _fetch_json("/debug")
        optimization_info = debug_data.get("optimization_info", {})
        if not isinstance(optimization_info, dict):
            raise ValueError("optimization_info is not a JSON object")
    except (requests.RequestException, ValueError) as e:
        print(f" Debug Check Failed: {e}")
        return False

    print("\n L4 Optimization Status:")
    fields = (
        ("Precision", "precision"),
        ("Quantization", "quantization"),
        ("Flash Attention", "flash_attention"),
        ("TF32", "tf32"),
        ("LoRA Merged", "lora_merged"),
    )
    for label, key in fields:
        print(f" {label}: {optimization_info.get(key, 'Unknown')}")

    # NOTE(review): quantization is compared against the *string* 'None',
    # not JSON null -- confirm this matches what /debug actually returns.
    optimizations_active = (
        optimization_info.get("precision") == "FP16"
        and optimization_info.get("quantization") == "None"
        and optimization_info.get("flash_attention") == "Enabled"
    )
    if optimizations_active:
        print(" L4 Optimizations: Active")
    else:
        print(" L4 Optimizations: May not be fully active")
    return True


def _check_performance() -> bool:
    """Time one ``/test`` call and print a coarse performance rating.

    Returns False when the request fails, returns a non-200 status, or the
    body is not valid JSON. A slow result is advisory and still returns True.
    """
    print("\n Quick Performance Test:")
    try:
        started = time.perf_counter()
        response = requests.get(f"{BASE_URL}/test", timeout=60)
        elapsed = time.perf_counter() - started
        if response.status_code != 200:
            print(" Performance Test Failed")
            return False
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f" Performance Test Error: {e}")
        return False

    reported = payload.get("generation_time") if isinstance(payload, dict) else None
    if isinstance(reported, (int, float)) and not isinstance(reported, bool):
        generation_time = reported
        print(f" Generation Time: {generation_time:.2f}s")
    else:
        # A missing metric must not default to 0 (which would rate as
        # "Excellent"). The round trip is an upper bound on generation
        # time, so rating on it can never overstate performance.
        generation_time = elapsed
        print(f" Generation Time: not reported; request round-trip took {elapsed:.2f}s")

    if generation_time < 6:
        print(" Performance: Excellent")
    elif generation_time < 8:
        print(" Performance: Good")
    else:
        print(" Performance: Needs Optimization")
    return True


def verify_deployment():
    """Verify the L4 optimization deployment and print a report.

    Checks, in order: ``/health``, the new ``/memory-status`` and
    ``/conversation`` endpoints, the optimization flags exposed by
    ``/debug``, and a single timed call to ``/test``.

    Returns:
        bool: False if the health check fails (the remaining checks are then
        skipped) or if any later check hits a request error, a non-200
        status, or an unparsable body. True otherwise. Advisory outcomes
        (optimizations not fully active, slow generation) are printed but do
        not change the result.
    """
    print("� L4 Optimization Deployment Verification")
    print("=" * 50)

    # Nothing else is meaningful until the model is loaded.
    if not _check_health():
        return False

    # Built as a list so every check runs and reports even after a failure.
    results = [
        _check_endpoints(),
        _check_optimizations(),
        _check_performance(),
    ]

    print(f"\n{'='*50}")
    print(" Verification Complete!")
    return all(results)
if __name__ == "__main__":
    # Surface a failed verification to the shell / CI as a non-zero exit
    # status. Only an explicit False counts as failure, so a None return is
    # never mistaken for one.
    if verify_deployment() is False:
        raise SystemExit(1)