|
import requests |
|
import json |
|
import time |
|
|
|
BASE_URL = "https://conquerorr000-turkish-medical-model-api.hf.space" |
|
|
|
def verify_deployment():
    """Verify the L4 optimization deployment of the remote model API.

    Runs four checks against ``BASE_URL``:

      1. ``/health``          -- model loaded and ready (hard requirement)
      2. ``/memory-status`` and ``/conversation`` -- endpoint availability
      3. ``/debug``           -- L4 optimization flags (FP16 precision,
                                 no quantization, flash attention)
      4. ``/test``            -- quick generation-speed smoke test

    Only the health check can abort verification; the remaining checks are
    informational and print their results.

    Returns:
        bool: True when the health check passes, False when the model is
        not ready or the health endpoint is unreachable.
    """
    # NOTE: original banner contained a mojibake character (broken emoji);
    # replaced with plain text.
    print("L4 Optimization Deployment Verification")
    print("=" * 50)

    # 1. Health check -- the only check that can abort verification.
    if not _check_health():
        return False

    # 2. Endpoint availability (informational).
    _check_endpoints()

    # 3. Optimization flags (informational).
    _check_optimizations()

    # 4. Quick performance smoke test (informational).
    _run_performance_test()

    print(f"\n{'='*50}")
    print(" Verification Complete!")
    # BUGFIX: previously fell off the end and returned None on success while
    # returning False on failure; now the contract is a consistent bool.
    return True


def _check_health():
    """Return True when /health reports the model as loaded, else False."""
    try:
        response = requests.get(f"{BASE_URL}/health", timeout=30)
        health_data = response.json()

        if health_data.get("model_loaded"):
            print(" Model Status: Loaded and Ready")
            return True
        print(" Model Status: Not Ready")
        return False
    except Exception as e:
        # Network errors, timeouts, and non-JSON bodies all count as failure.
        print(f" Health Check Failed: {e}")
        return False


def _check_endpoints():
    """Probe the secondary API endpoints and print each one's status."""
    endpoints_to_check = ["/memory-status", "/conversation"]

    for endpoint in endpoints_to_check:
        try:
            if endpoint == "/conversation":
                # /conversation expects a POST with a minimal chat payload.
                test_data = {
                    "messages": [{"role": "user", "content": "Test"}],
                    "max_tokens": 50,
                }
                response = requests.post(
                    f"{BASE_URL}{endpoint}", json=test_data, timeout=30
                )
            else:
                response = requests.get(f"{BASE_URL}{endpoint}", timeout=30)

            if response.status_code == 200:
                print(f" Endpoint {endpoint}: Working")
            else:
                print(f" Endpoint {endpoint}: Failed ({response.status_code})")
        except Exception as e:
            print(f" Endpoint {endpoint}: Error - {e}")


def _check_optimizations():
    """Fetch /debug and report whether the L4 optimization flags are active."""
    try:
        response = requests.get(f"{BASE_URL}/debug", timeout=30)
        debug_data = response.json()

        optimization_info = debug_data.get("optimization_info", {})

        print("\n L4 Optimization Status:")
        print(f" Precision: {optimization_info.get('precision', 'Unknown')}")
        print(f" Quantization: {optimization_info.get('quantization', 'Unknown')}")
        print(f" Flash Attention: {optimization_info.get('flash_attention', 'Unknown')}")
        print(f" TF32: {optimization_info.get('tf32', 'Unknown')}")
        print(f" LoRA Merged: {optimization_info.get('lora_merged', 'Unknown')}")

        # The deployment counts as fully optimized only with FP16 weights,
        # no quantization, and flash attention enabled.
        optimizations_active = (
            optimization_info.get('precision') == 'FP16' and
            optimization_info.get('quantization') == 'None' and
            optimization_info.get('flash_attention') == 'Enabled'
        )

        if optimizations_active:
            print(" L4 Optimizations: Active")
        else:
            print(" L4 Optimizations: May not be fully active")
    except Exception as e:
        print(f" Debug Check Failed: {e}")


def _run_performance_test():
    """Hit /test and grade the server-reported generation time."""
    try:
        print("\n Quick Performance Test:")
        # NOTE: the timing comes from the server's own "generation_time"
        # field, not from local wall-clock measurement (a dead local
        # start_time = time.time() was removed).
        response = requests.get(f"{BASE_URL}/test", timeout=60)

        if response.status_code == 200:
            test_data = response.json()
            generation_time = test_data.get("generation_time", 0)

            print(f" Generation Time: {generation_time:.2f}s")

            # Thresholds chosen for the L4 deployment target: <6s excellent,
            # <8s acceptable, anything slower needs attention.
            if generation_time < 6:
                print(" Performance: Excellent")
            elif generation_time < 8:
                print(" Performance: Good")
            else:
                print(" Performance: Needs Optimization")
        else:
            print(" Performance Test Failed")
    except Exception as e:
        print(f" Performance Test Error: {e}")
|
|
|
# Script entry point: run the deployment verification when executed directly.
if __name__ == "__main__":

    verify_deployment()
|
|