"""Verify the L4 optimization deployment of the API served at ``BASE_URL``.

Run as a script: prints a human-readable report and exits with status 1
when the health check fails, 0 otherwise.
"""
import json
import sys
import time

import requests

BASE_URL = "https://conquerorr000-turkish-medical-model-api.hf.space"

# Generation-time cut-offs (seconds) used to rate the /test endpoint.
_EXCELLENT_BELOW_S = 6
_GOOD_BELOW_S = 8

# Values /debug must report for the optimizations to count as active.
_EXPECTED_OPTIMIZATIONS = {
    "precision": "FP16",
    "quantization": "None",
    "flash_attention": "Enabled",
}

# (label, key) pairs printed from the /debug "optimization_info" object.
_OPTIMIZATION_FIELDS = (
    ("Precision", "precision"),
    ("Quantization", "quantization"),
    ("Flash Attention", "flash_attention"),
    ("TF32", "tf32"),
    ("LoRA Merged", "lora_merged"),
)

# Failures a single check is allowed to absorb: transport/HTTP errors and
# malformed payloads. Anything else is a programming error and propagates.
_CHECK_ERRORS = (requests.RequestException, ValueError)


def _fetch_json_object(path: str, timeout: float = 30) -> dict:
    """GET ``BASE_URL + path`` and return the decoded JSON object.

    Raises:
        requests.RequestException: on transport failure or a 4xx/5xx status.
        ValueError: if the body is not valid JSON or is not a JSON object.
    """
    response = requests.get(f"{BASE_URL}{path}", timeout=timeout)
    # Fail on the HTTP status first so an HTML error page is reported as an
    # HTTP error rather than as an opaque JSON decode failure.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, dict):
        raise ValueError(
            f"expected a JSON object from {path}, got {type(payload).__name__}"
        )
    return payload


def _check_health() -> bool:
    """Report model readiness from /health; return True only if it is loaded."""
    try:
        health = _fetch_json_object("/health")
    except _CHECK_ERRORS as exc:
        print(f" Health Check Failed: {exc}")
        return False

    if health.get("model_loaded"):
        print(" Model Status: Loaded and Ready")
        return True

    print(" Model Status: Not Ready")
    return False


def _check_endpoints() -> None:
    """Probe /memory-status (GET) and /conversation (POST) and print results."""
    conversation_probe = {
        "messages": [{"role": "user", "content": "Test"}],
        "max_tokens": 50,
    }
    for endpoint in ("/memory-status", "/conversation"):
        url = f"{BASE_URL}{endpoint}"
        try:
            if endpoint == "/conversation":
                response = requests.post(url, json=conversation_probe, timeout=30)
            else:
                response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f" Endpoint {endpoint}: Error - {exc}")
            continue

        if response.status_code == 200:
            print(f" Endpoint {endpoint}: Working")
        else:
            print(f" Endpoint {endpoint}: Failed ({response.status_code})")


def _report_optimizations() -> None:
    """Print the optimization flags reported by /debug and whether they match."""
    try:
        debug = _fetch_json_object("/debug")
    except _CHECK_ERRORS as exc:
        print(f" Debug Check Failed: {exc}")
        return

    info = debug.get("optimization_info", {})
    if not isinstance(info, dict):
        # A null/non-object value is treated like a missing section so the
        # report still prints "Unknown" for every field.
        info = {}

    print("\n L4 Optimization Status:")
    for label, key in _OPTIMIZATION_FIELDS:
        print(f" {label}: {info.get(key, 'Unknown')}")

    active = all(
        info.get(key) == expected
        for key, expected in _EXPECTED_OPTIMIZATIONS.items()
    )
    if active:
        print(" L4 Optimizations: Active")
    else:
        print(" L4 Optimizations: May not be fully active")


def _run_performance_test() -> None:
    """Call /test once and rate the generation time it reports."""
    print("\n Quick Performance Test:")
    try:
        started = time.perf_counter()
        response = requests.get(f"{BASE_URL}/test", timeout=60)
        elapsed = time.perf_counter() - started
        if response.status_code != 200:
            print(" Performance Test Failed")
            return
        payload = response.json()
    except _CHECK_ERRORS as exc:
        print(f" Performance Test Error: {exc}")
        return

    reported = payload.get("generation_time") if isinstance(payload, dict) else None
    if isinstance(reported, (int, float)) and not isinstance(reported, bool):
        generation_time = reported
    else:
        # A missing or non-numeric metric must not default to 0 (which would
        # rate as "Excellent"); fall back to the measured round-trip time,
        # which is an upper bound on the server-side generation time.
        generation_time = elapsed
        print(" Generation time not reported by server; using round-trip time")

    print(f" Generation Time: {generation_time:.2f}s")
    if generation_time < _EXCELLENT_BELOW_S:
        print(" Performance: Excellent")
    elif generation_time < _GOOD_BELOW_S:
        print(" Performance: Good")
    else:
        print(" Performance: Needs Optimization")


def verify_deployment() -> bool:
    """Verify L4 optimization deployment.

    Runs, in order: the /health check, the endpoint probes, the /debug
    optimization report and a single /test timing run, printing a report
    to stdout as it goes.

    Returns:
        False if the health check fails or the model is not loaded (the
        remaining checks are skipped in that case); True once the full
        report has been printed. The later checks are advisory and do not
        affect the return value.
    """
    # NOTE(review): the leading character below is U+FFFD, apparently left
    # over from a mis-encoded emoji; kept verbatim pending the intended glyph.
    print("� L4 Optimization Deployment Verification")
    print("=" * 50)

    if not _check_health():
        return False

    _check_endpoints()
    _report_optimizations()
    _run_performance_test()

    print(f"\n{'='*50}")
    print(" Verification Complete!")
    return True


if __name__ == "__main__":
    # Surface a failed health check to shells and CI via the exit status.
    sys.exit(0 if verify_deployment() else 1)