#!/usr/bin/env python3
"""
Turkish Medical Model API - Nvidia L4 24GB Optimized
Maximum performance for medical conversations
"""
import os
import logging
import time
import asyncio
import gc
from typing import Dict, Optional, List
# CRITICAL: Set cache directories BEFORE importing anything
CACHE_DIR = "/tmp/hf_cache"
TRITON_CACHE = "/tmp/triton_cache"
# Set environment variables for L4 optimization
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_HUB_CACHE"] = CACHE_DIR
os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE
os.environ["CUDA_CACHE_PATH"] = "/tmp/cuda_cache"
# L4 Performance optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
os.environ["TORCH_USE_CUDA_DSA"] = "1" # Enable CUDA DSA for L4
# Enable optimizations for L4
os.environ["TORCH_COMPILE"] = "0" # Keep disabled for stability
os.environ["PYTORCH_COMPILE"] = "0"
# Create cache directories
for cache_path in [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache"]:
os.makedirs(cache_path, exist_ok=True)
os.chmod(cache_path, 0o777)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Get HF token
HF_TOKEN = os.getenv("HF_TOKEN")
# Global variables
tokenizer = None
model = None
generation_config = None
model_loaded = False
loading_error = None
device = "cuda:0"
# Medical conversation templates (Turkish prompts with English glosses)
MEDICAL_TEMPLATES = {
    # "Hello, I am the anamnesis assistant. How can I help you? Do you have any complaints?"
    "greeting": "Merhaba, ben anamnez asistanıyım. Size nasıl yardımcı olabilirim? Herhangi bir şikayetiniz var mı?",
    # "Could you give more information about this?"
    "follow_up": "Bu konuda daha fazla bilgi verebilir misiniz?",
    # "I understand. Do you have any other symptoms related to this condition?"
    "clarification": "Anlıyorum. Bu durumla ilgili başka belirtileriniz var mı?",
    # "Thank you. The anamnesis is complete."
    "closing": "Teşekkür ederim. Anamnez tamamlandı."
}
# Pydantic models
class Message(BaseModel):
role: str # "user" or "assistant"
content: str
class ChatRequest(BaseModel):
message: str
max_tokens: int = 200
temperature: float = 0.7
conversation_history: Optional[List[Message]] = []
class ConversationRequest(BaseModel):
messages: List[Message]
max_tokens: int = 200
temperature: float = 0.7
class ChatResponse(BaseModel):
response: str
generation_time: float
tokens_generated: int
conversation_turn: int
class HealthResponse(BaseModel):
status: str
model_loaded: bool
gpu_available: bool
error: Optional[str] = None
def setup_l4_optimization():
"""Setup optimizations specific to Nvidia L4"""
if torch.cuda.is_available():
# L4 specific settings
torch.cuda.set_per_process_memory_fraction(0.9) # Use 90% of 24GB
torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for L4
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.enable_flash_sdp(True) # Enable Flash Attention
logger.info("🎯 L4 optimizations enabled: TF32, Flash Attention")
def clear_gpu_memory():
"""Optimized GPU memory cleanup for L4"""
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
gc.collect()
def setup_cache_directories():
"""Setup cache directories"""
cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"]
for cache_dir in cache_dirs:
try:
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777)
logger.info(f"✅ Created cache dir: {cache_dir}")
except Exception as e:
logger.warning(f"⚠️ Could not create {cache_dir}: {e}")
def clear_cache_locks():
"""Clear cache locks"""
try:
all_cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"]
for cache_dir in all_cache_dirs:
if os.path.exists(cache_dir):
for root, dirs, files in os.walk(cache_dir):
for file in files:
if file.endswith('.lock') or file.endswith('.incomplete'):
lock_file = os.path.join(root, file)
try:
os.remove(lock_file)
                            except OSError:
                                pass
except Exception as e:
logger.warning(f"Could not clear cache locks: {e}")
def format_medical_conversation(messages: List[Message]) -> str:
"""Format conversation for medical context"""
conversation = ""
    for msg in messages:
if msg.role == "assistant":
conversation += f"Doktor: {msg.content}\n"
else:
conversation += f"Hasta: {msg.content}\n"
conversation += "Doktor:"
return conversation
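# Illustration of the prompt produced above (the exchange itself is hypothetical):
#   messages = [assistant: "Merhaba, şikayetiniz nedir?", user: "Başım ağrıyor."]
#   becomes:
#       Doktor: Merhaba, şikayetiniz nedir?
#       Hasta: Başım ağrıyor.
#       Doktor:
# The trailing "Doktor:" cues the model to continue the reply in the doctor role.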
def clean_medical_response(response: str) -> str:
"""Clean and validate medical response"""
# Remove extra whitespace
response = response.strip()
# Remove role prefixes if they appear in response
prefixes_to_remove = ["Doktor:", "Hasta:", "Assistant:", "Human:"]
for prefix in prefixes_to_remove:
if response.startswith(prefix):
response = response[len(prefix):].strip()
# Split by sentences and clean
sentences = response.split('.')
clean_sentences = []
for sentence in sentences:
sentence = sentence.strip()
if len(sentence) > 10 and not sentence.startswith("Hasta"):
clean_sentences.append(sentence)
if len(clean_sentences) >= 3: # Limit to 3 sentences for clarity
break
if clean_sentences:
response = '. '.join(clean_sentences)
if not response.endswith('.'):
response += '.'
else:
        # Fallback response: "Let me try to help you with this. Could you please describe your symptoms in more detail?"
        response = "Bu konuda size yardımcı olmaya çalışayım. Lütfen belirtilerinizi daha detaylı anlatabilir misiniz?"
return response
async def load_model():
"""Load model optimized for Nvidia L4 24GB"""
global tokenizer, model, generation_config, model_loaded, loading_error
if model_loaded:
return True
try:
logger.info("🚀 Loading Turkish Medical Model - L4 24GB Optimized...")
setup_cache_directories()
clear_cache_locks()
setup_l4_optimization()
clear_gpu_memory()
start_time = time.time()
# Check L4 memory
if torch.cuda.is_available():
props = torch.cuda.get_device_properties(0)
total_memory = props.total_memory / (1024**3)
logger.info(f"🎮 GPU: {props.name}")
logger.info(f"🎮 Total VRAM: {total_memory:.1f}GB")
# Load tokenizer
logger.info("📚 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
"Conquerorr000/llama-3.1-8b-turkish-medical-lora",
cache_dir=CACHE_DIR,
trust_remote_code=True,
token=HF_TOKEN,
use_fast=True # Use fast tokenizer for speed
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
logger.info("✅ Tokenizer loaded successfully")
# Load base model - NO QUANTIZATION for L4
logger.info("🧠 Loading base model (FP16 - Full Precision)...")
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3.1-8B-Instruct",
cache_dir=CACHE_DIR,
torch_dtype=torch.float16, # FP16 for speed
device_map="auto",
trust_remote_code=True,
low_cpu_mem_usage=True,
token=HF_TOKEN,
            attn_implementation="flash_attention_2",  # Flash Attention 2 (requires the flash-attn package to be installed)
use_cache=True
)
logger.info("✅ Base model loaded (FP16)")
# Check memory after base model
if torch.cuda.is_available():
allocated = torch.cuda.memory_allocated(0) / (1024**3)
logger.info(f"🎮 Memory after base model: {allocated:.2f}GB")
# Load LoRA adapter
logger.info("🎯 Loading LoRA adapter...")
try:
lora_model = PeftModel.from_pretrained(
model,
"Conquerorr000/llama-3.1-8b-turkish-medical-lora",
cache_dir=CACHE_DIR,
torch_dtype=torch.float16,
token=HF_TOKEN,
is_trainable=False
)
logger.info("✅ LoRA adapter loaded successfully")
# Merge LoRA for better performance (L4 has enough memory)
logger.info("🔗 Merging LoRA adapter for better performance...")
try:
model = lora_model.merge_and_unload()
logger.info("✅ LoRA merged successfully")
except Exception as merge_error:
logger.warning(f"⚠️ LoRA merge failed: {merge_error}")
model = lora_model
logger.info("📝 Using LoRA as adapter")
except Exception as lora_error:
logger.warning(f"⚠️ LoRA loading failed: {lora_error}")
logger.info("📝 Continuing with base model only")
# Setup optimized generation config
generation_config = GenerationConfig(
max_new_tokens=200,
temperature=0.7,
top_p=0.9,
top_k=50,
do_sample=True,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
repetition_penalty=1.1,
no_repeat_ngram_size=3,
use_cache=True
)
# Set to evaluation mode
model.eval()
# Final memory cleanup
clear_gpu_memory()
loading_time = time.time() - start_time
logger.info(f"✅ Model loaded successfully in {loading_time:.2f}s")
# Log final memory usage
if torch.cuda.is_available():
total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
allocated = torch.cuda.memory_allocated(0) / (1024**3)
free = total_memory - allocated
logger.info(f"🎮 Final Memory: Allocated={allocated:.2f}GB, Free={free:.2f}GB")
model_loaded = True
return True
except Exception as e:
error_msg = f"Model loading failed: {str(e)}"
logger.error(f"❌ {error_msg}")
loading_error = error_msg
model_loaded = False
clear_gpu_memory()
return False
async def generate_response(messages: List[Message], max_tokens: int = 200, temperature: float = 0.7) -> Dict:
"""Generate medical response with L4 optimization"""
global model, tokenizer, generation_config
if not model_loaded:
raise HTTPException(status_code=503, detail="Model not loaded")
try:
start_time = time.time()
# Format conversation for medical context
conversation_text = format_medical_conversation(messages)
# Tokenize with optimizations
inputs = tokenizer(
conversation_text,
return_tensors="pt",
padding=True,
truncation=True,
max_length=1024,
add_special_tokens=True
)
# Move to GPU
inputs = {k: v.to(device) for k, v in inputs.items()}
        # Build a per-request copy so the shared generation config is never mutated
        gen_config = GenerationConfig(**generation_config.to_dict())
        gen_config.max_new_tokens = min(max_tokens, 200)
        gen_config.temperature = temperature
        # Generate with L4 optimizations (attention_mask is already passed via **inputs)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                generation_config=gen_config,
                use_cache=True
            )
# Decode response
input_length = inputs["input_ids"].shape[1]
generated_ids = outputs[0][input_length:]
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
# Clean medical response
response = clean_medical_response(generated_text)
generation_time = time.time() - start_time
        # Cleanup (record the real number of generated tokens before freeing tensors)
        num_new_tokens = int(generated_ids.shape[0])
        del outputs, generated_ids
        torch.cuda.empty_cache()
        return {
            "response": response,
            "generation_time": round(generation_time, 3),
            "tokens_generated": num_new_tokens,
            "conversation_turn": len(messages) + 1
        }
except Exception as e:
logger.error(f"Generation error: {e}")
torch.cuda.empty_cache()
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
# Create FastAPI app
app = FastAPI(
title="Turkish Medical Model API",
description="Turkish medical conversation model - L4 24GB Optimized",
version="2.0.0"
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.on_event("startup")
async def startup_event():
"""Load model on startup"""
logger.info("🚀 Starting API server...")
logger.info(f"📁 HF Cache: {CACHE_DIR}")
logger.info(f"🎮 Target GPU: Nvidia L4 24GB")
logger.info(f"💾 Mode: FP16 Full Precision")
logger.info(f"⚡ Optimizations: TF32, Flash Attention")
if HF_TOKEN:
logger.info("✅ HF Token found")
    else:
        logger.warning("⚠️ No HF token found; downloads of gated models may fail")
setup_cache_directories()
clear_cache_locks()
setup_l4_optimization()
# Start model loading
asyncio.create_task(load_model())
@app.get("/", response_model=HealthResponse)
async def root():
return HealthResponse(
status="healthy" if model_loaded else "loading",
model_loaded=model_loaded,
gpu_available=torch.cuda.is_available(),
error=loading_error
)
@app.get("/health", response_model=HealthResponse)
async def health_check():
return HealthResponse(
status="healthy" if model_loaded else "loading",
model_loaded=model_loaded,
gpu_available=torch.cuda.is_available(),
error=loading_error
)
@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
"""Single message chat endpoint"""
try:
        # Convert the single message into conversation format (copy so the request model is not mutated)
        messages = list(request.conversation_history or [])
        messages.append(Message(role="user", content=request.message))
result = await generate_response(
messages,
request.max_tokens,
request.temperature
)
return ChatResponse(
response=result["response"],
generation_time=result["generation_time"],
tokens_generated=result["tokens_generated"],
conversation_turn=result["conversation_turn"]
)
    except HTTPException:
        raise  # Preserve status codes such as 503 (model not loaded)
    except Exception as e:
        logger.error(f"Chat error: {e}")
        raise HTTPException(status_code=500, detail=f"Chat failed: {str(e)}")
@app.post("/conversation", response_model=ChatResponse)
async def conversation_endpoint(request: ConversationRequest):
"""Full conversation endpoint"""
try:
result = await generate_response(
request.messages,
request.max_tokens,
request.temperature
)
return ChatResponse(
response=result["response"],
generation_time=result["generation_time"],
tokens_generated=result["tokens_generated"],
conversation_turn=result["conversation_turn"]
)
    except HTTPException:
        raise  # Preserve status codes such as 503 (model not loaded)
    except Exception as e:
        logger.error(f"Conversation error: {e}")
        raise HTTPException(status_code=500, detail=f"Conversation failed: {str(e)}")
@app.get("/test")
async def test_endpoint():
"""Medical conversation test"""
if not model_loaded:
return {
"status": "model_not_ready",
"message": "Model is still loading...",
"error": loading_error
}
try:
# Test medical conversation
test_messages = [
Message(role="user", content="Merhaba doktor, 2 gündür başım ağrıyor ve ateşim var.")
]
result = await generate_response(test_messages, 150, 0.7)
return {
"status": "success",
"test_input": test_messages[0].content,
"test_output": result["response"],
"generation_time": result["generation_time"],
"device_info": device,
"conversation_turn": result["conversation_turn"]
}
except Exception as e:
logger.error(f"Test error: {e}")
return {
"status": "error",
"message": f"Test failed: {str(e)}"
}
@app.get("/memory-status")
async def memory_status():
"""Get GPU memory status"""
memory_info = {"gpu_available": torch.cuda.is_available()}
if torch.cuda.is_available():
props = torch.cuda.get_device_properties(0)
total_memory = props.total_memory / (1024**3)
allocated = torch.cuda.memory_allocated(0) / (1024**3)
reserved = torch.cuda.memory_reserved(0) / (1024**3)
free = total_memory - allocated
memory_info.update({
"gpu_name": props.name,
"total_memory_gb": round(total_memory, 2),
"allocated_memory_gb": round(allocated, 2),
"reserved_memory_gb": round(reserved, 2),
"free_memory_gb": round(free, 2),
"utilization_percent": round((allocated / total_memory) * 100, 1)
})
return memory_info
@app.get("/debug")
async def debug_info():
"""Enhanced debug information"""
model_device_info = {}
if model:
try:
            # Collect the device of every parameter so "device_consistent" below is meaningful
            devices = {str(param.device) for param in model.parameters()}
model_device_info = {
"model_devices": list(devices),
"device_consistent": len(devices) == 1,
"first_param_device": str(next(model.parameters()).device)
}
        except Exception as e:
            model_device_info = {"error": f"Could not get model device info: {e}"}
memory_info = await memory_status()
return {
"model_status": {
"model_loaded": model_loaded,
"loading_error": loading_error,
"model_type": type(model).__name__ if model else None,
**model_device_info
},
"system_info": {
"target_device": device,
"gpu_available": torch.cuda.is_available(),
"torch_version": torch.__version__,
"cuda_version": torch.version.cuda if torch.cuda.is_available() else None
},
"memory_info": memory_info,
"optimization_info": {
"precision": "FP16",
"quantization": "None",
"flash_attention": "Enabled",
"tf32": "Enabled",
"lora_merged": "Yes" if model_loaded else "Unknown"
},
"cache_info": {
"hf_cache": CACHE_DIR,
"cache_exists": os.path.exists(CACHE_DIR),
"cache_writable": os.access(CACHE_DIR, os.W_OK) if os.path.exists(CACHE_DIR) else False
}
}
if __name__ == "__main__":
import uvicorn
uvicorn.run("app:app", host="0.0.0.0", port=7860)