#!/usr/bin/env python3
"""
Turkish Medical Model API - Nvidia L4 24GB Optimized
Maximum performance for medical conversations
"""

import os
import logging
import time
import asyncio
import gc
from typing import Dict, Optional, List

# CRITICAL: Set cache directories BEFORE importing anything
CACHE_DIR = "/tmp/hf_cache"
TRITON_CACHE = "/tmp/triton_cache"

# Set environment variables for L4 optimization
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_HUB_CACHE"] = CACHE_DIR
os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE
os.environ["CUDA_CACHE_PATH"] = "/tmp/cuda_cache"

# L4 performance settings
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
os.environ["TORCH_USE_CUDA_DSA"] = "1"  # CUDA device-side assertions (debugging aid)

# Keep torch.compile disabled for stability
os.environ["TORCH_COMPILE"] = "0"
os.environ["PYTORCH_COMPILE"] = "0"

# Create cache directories
for cache_path in [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache"]:
    os.makedirs(cache_path, exist_ok=True)
    os.chmod(cache_path, 0o777)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Get HF token
HF_TOKEN = os.getenv("HF_TOKEN")

# Global state
tokenizer = None
model = None
generation_config = None
model_loaded = False
loading_error = None
device = "cuda:0"

# Medical conversation templates
MEDICAL_TEMPLATES = {
    "greeting": "Merhaba, ben anamnez asistanıyım. Size nasıl yardımcı olabilirim? Herhangi bir şikayetiniz var mı?",
    "follow_up": "Bu konuda daha fazla bilgi verebilir misiniz?",
    "clarification": "Anlıyorum. Bu durumla ilgili başka belirtileriniz var mı?",
    "closing": "Teşekkür ederim. Anamnez tamamlandı.",
}
# Pydantic models
class Message(BaseModel):
    role: str  # "user" or "assistant"
    content: str


class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 200
    temperature: float = 0.7
    conversation_history: Optional[List[Message]] = []


class ConversationRequest(BaseModel):
    messages: List[Message]
    max_tokens: int = 200
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str
    generation_time: float
    tokens_generated: int
    conversation_turn: int


class HealthResponse(BaseModel):
    status: str
    model_loaded: bool
    gpu_available: bool
    error: Optional[str] = None


def setup_l4_optimization():
    """Setup optimizations specific to Nvidia L4"""
    if torch.cuda.is_available():
        # L4 specific settings
        torch.cuda.set_per_process_memory_fraction(0.9)  # Use 90% of 24GB
        torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for L4
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.enable_flash_sdp(True)  # Enable flash scaled-dot-product attention
        logger.info("🎯 L4 optimizations enabled: TF32, Flash Attention")


def clear_gpu_memory():
    """Optimized GPU memory cleanup for L4"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    gc.collect()


def setup_cache_directories():
    """Setup cache directories"""
    cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"]
    for cache_dir in cache_dirs:
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
            logger.info(f"✅ Created cache dir: {cache_dir}")
        except Exception as e:
            logger.warning(f"⚠️ Could not create {cache_dir}: {e}")


def clear_cache_locks():
    """Remove stale lock and incomplete download files from the caches"""
    try:
        all_cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"]
        for cache_dir in all_cache_dirs:
            if os.path.exists(cache_dir):
                for root, dirs, files in os.walk(cache_dir):
                    for file in files:
                        if file.endswith(".lock") or file.endswith(".incomplete"):
                            lock_file = os.path.join(root, file)
                            try:
                                os.remove(lock_file)
                            except OSError:
                                pass
    except Exception as e:
        logger.warning(f"Could not clear cache locks: {e}")


def format_medical_conversation(messages: List[Message]) -> str:
    """Format conversation for medical context"""
    conversation = ""
    for msg in messages:
        if msg.role == "assistant":
            conversation += f"Doktor: {msg.content}\n"
        else:
            conversation += f"Hasta: {msg.content}\n"
    conversation += "Doktor:"
    return conversation


def clean_medical_response(response: str) -> str:
    """Clean and validate medical response"""
    # Remove extra whitespace
    response = response.strip()

    # Remove role prefixes if they appear in the response
    prefixes_to_remove = ["Doktor:", "Hasta:", "Assistant:", "Human:"]
    for prefix in prefixes_to_remove:
        if response.startswith(prefix):
            response = response[len(prefix):].strip()

    # Split into sentences and keep the first few clean ones
    sentences = response.split(".")
    clean_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 10 and not sentence.startswith("Hasta"):
            clean_sentences.append(sentence)
        if len(clean_sentences) >= 3:  # Limit to 3 sentences for clarity
            break

    if clean_sentences:
        response = ". ".join(clean_sentences)
        if not response.endswith("."):
            response += "."
    else:
        # Fallback response
        response = "Bu konuda size yardımcı olmaya çalışayım. Lütfen belirtilerinizi daha detaylı anlatabilir misiniz?"

    return response
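
# Prompt shape produced by format_medical_conversation (illustrative example,
# using the greeting template and the test message from this file):
#
#   Doktor: Merhaba, ben anamnez asistanıyım. Size nasıl yardımcı olabilirim?
#   Hasta: 2 gündür başım ağrıyor ve ateşim var.
#   Doktor:
#
# clean_medical_response then strips any leaked role prefixes from the model's
# completion and keeps at most the first three well-formed sentences.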
async def load_model():
    """Load model optimized for Nvidia L4 24GB"""
    global tokenizer, model, generation_config, model_loaded, loading_error

    if model_loaded:
        return True

    try:
        logger.info("🚀 Loading Turkish Medical Model - L4 24GB Optimized...")
        setup_cache_directories()
        clear_cache_locks()
        setup_l4_optimization()
        clear_gpu_memory()

        start_time = time.time()

        # Check L4 memory
        if torch.cuda.is_available():
            props = torch.cuda.get_device_properties(0)
            total_memory = props.total_memory / (1024**3)
            logger.info(f"🎮 GPU: {props.name}")
            logger.info(f"🎮 Total VRAM: {total_memory:.1f}GB")

        # Load tokenizer
        logger.info("📚 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            "Conquerorr000/llama-3.1-8b-turkish-medical-lora",
            cache_dir=CACHE_DIR,
            trust_remote_code=True,
            token=HF_TOKEN,
            use_fast=True,  # Use fast tokenizer for speed
        )

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id

        logger.info("✅ Tokenizer loaded successfully")

        # Use Flash Attention 2 when the flash-attn package is installed,
        # otherwise fall back to PyTorch SDPA
        try:
            import flash_attn  # noqa: F401
            attn_implementation = "flash_attention_2"
        except ImportError:
            attn_implementation = "sdpa"
            logger.info("📝 flash-attn not installed, using SDPA attention")

        # Load base model - NO QUANTIZATION for L4
        logger.info("🧠 Loading base model (FP16 - Full Precision)...")
        model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Meta-Llama-3.1-8B-Instruct",
            cache_dir=CACHE_DIR,
            torch_dtype=torch.float16,  # FP16 for speed
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            token=HF_TOKEN,
            attn_implementation=attn_implementation,
            use_cache=True,
        )
        logger.info("✅ Base model loaded (FP16)")

        # Check memory after base model
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated(0) / (1024**3)
            logger.info(f"🎮 Memory after base model: {allocated:.2f}GB")

        # Load LoRA adapter
        logger.info("🎯 Loading LoRA adapter...")
        try:
            lora_model = PeftModel.from_pretrained(
                model,
                "Conquerorr000/llama-3.1-8b-turkish-medical-lora",
                cache_dir=CACHE_DIR,
                torch_dtype=torch.float16,
                token=HF_TOKEN,
                is_trainable=False,
            )
            logger.info("✅ LoRA adapter loaded successfully")

            # Merge LoRA for better performance (L4 has enough memory)
            logger.info("🔗 Merging LoRA adapter for better performance...")
            try:
                model = lora_model.merge_and_unload()
                logger.info("✅ LoRA merged successfully")
            except Exception as merge_error:
                logger.warning(f"⚠️ LoRA merge failed: {merge_error}")
                model = lora_model
                logger.info("📝 Using LoRA as adapter")

        except Exception as lora_error:
            logger.warning(f"⚠️ LoRA loading failed: {lora_error}")
            logger.info("📝 Continuing with base model only")

        # Setup optimized generation config
        generation_config = GenerationConfig(
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
            use_cache=True,
        )

        # Set to evaluation mode
        model.eval()

        # Final memory cleanup
        clear_gpu_memory()

        loading_time = time.time() - start_time
        logger.info(f"✅ Model loaded successfully in {loading_time:.2f}s")

        # Log final memory usage
        if torch.cuda.is_available():
            total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            allocated = torch.cuda.memory_allocated(0) / (1024**3)
            free = total_memory - allocated
            logger.info(f"🎮 Final Memory: Allocated={allocated:.2f}GB, Free={free:.2f}GB")

        model_loaded = True
        return True

    except Exception as e:
        error_msg = f"Model loading failed: {str(e)}"
        logger.error(f"❌ {error_msg}")
        loading_error = error_msg
        model_loaded = False
        clear_gpu_memory()
        return False
async def generate_response(messages: List[Message], max_tokens: int = 200, temperature: float = 0.7) -> Dict:
    """Generate medical response with L4 optimization"""
    global model, tokenizer, generation_config

    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        start_time = time.time()

        # Format conversation for medical context
        conversation_text = format_medical_conversation(messages)

        # Tokenize with optimizations
        inputs = tokenizer(
            conversation_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
            add_special_tokens=True,
        )

        # Move to GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate with L4 optimizations; per-request overrides are passed as
        # kwargs so the shared generation_config is not mutated
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                generation_config=generation_config,
                max_new_tokens=min(max_tokens, 200),
                temperature=temperature,
            )

        # Decode response
        input_length = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][input_length:]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Clean medical response
        response = clean_medical_response(generated_text)

        generation_time = time.time() - start_time
        tokens_generated = int(generated_ids.shape[0])

        # Cleanup
        del outputs, generated_ids
        torch.cuda.empty_cache()

        return {
            "response": response,
            "generation_time": round(generation_time, 3),
            "tokens_generated": tokens_generated,
            "conversation_turn": len(messages) + 1,
        }

    except Exception as e:
        logger.error(f"Generation error: {e}")
        torch.cuda.empty_cache()
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")


# Create FastAPI app
app = FastAPI(
    title="Turkish Medical Model API",
    description="Turkish medical conversation model - L4 24GB Optimized",
    version="2.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.on_event("startup")
async def startup_event():
    """Load model on startup"""
    logger.info("🚀 Starting API server...")
    logger.info(f"📁 HF Cache: {CACHE_DIR}")
    logger.info("🎮 Target GPU: Nvidia L4 24GB")
    logger.info("💾 Mode: FP16 Full Precision")
    logger.info("⚡ Optimizations: TF32, Flash Attention")

    if HF_TOKEN:
        logger.info("✅ HF Token found")
    else:
        logger.info("ℹ️ No HF Token")

    setup_cache_directories()
    clear_cache_locks()
    setup_l4_optimization()

    # Start model loading in the background
    asyncio.create_task(load_model())


@app.get("/", response_model=HealthResponse)
async def root():
    return HealthResponse(
        status="healthy" if model_loaded else "loading",
        model_loaded=model_loaded,
        gpu_available=torch.cuda.is_available(),
        error=loading_error,
    )


@app.get("/health", response_model=HealthResponse)
async def health_check():
    return HealthResponse(
        status="healthy" if model_loaded else "loading",
        model_loaded=model_loaded,
        gpu_available=torch.cuda.is_available(),
        error=loading_error,
    )


@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    """Single message chat endpoint"""
    try:
        # Build the conversation without mutating the request's history list
        messages = list(request.conversation_history or [])
        messages.append(Message(role="user", content=request.message))

        result = await generate_response(
            messages,
            request.max_tokens,
            request.temperature,
        )

        return ChatResponse(
            response=result["response"],
            generation_time=result["generation_time"],
            tokens_generated=result["tokens_generated"],
            conversation_turn=result["conversation_turn"],
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Chat error: {e}")
        raise HTTPException(status_code=500, detail=f"Chat failed: {str(e)}")
@app.post("/conversation", response_model=ChatResponse)
async def conversation_endpoint(request: ConversationRequest):
    """Full conversation endpoint"""
    try:
        result = await generate_response(
            request.messages,
            request.max_tokens,
            request.temperature,
        )

        return ChatResponse(
            response=result["response"],
            generation_time=result["generation_time"],
            tokens_generated=result["tokens_generated"],
            conversation_turn=result["conversation_turn"],
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Conversation error: {e}")
        raise HTTPException(status_code=500, detail=f"Conversation failed: {str(e)}")


@app.get("/test")
async def test_endpoint():
    """Medical conversation test"""
    if not model_loaded:
        return {
            "status": "model_not_ready",
            "message": "Model is still loading...",
            "error": loading_error,
        }

    try:
        # Test medical conversation
        test_messages = [
            Message(role="user", content="Merhaba doktor, 2 gündür başım ağrıyor ve ateşim var.")
        ]

        result = await generate_response(test_messages, 150, 0.7)

        return {
            "status": "success",
            "test_input": test_messages[0].content,
            "test_output": result["response"],
            "generation_time": result["generation_time"],
            "device_info": device,
            "conversation_turn": result["conversation_turn"],
        }

    except Exception as e:
        logger.error(f"Test error: {e}")
        return {
            "status": "error",
            "message": f"Test failed: {str(e)}",
        }


@app.get("/memory-status")
async def memory_status():
    """Get GPU memory status"""
    memory_info = {"gpu_available": torch.cuda.is_available()}

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        total_memory = props.total_memory / (1024**3)
        allocated = torch.cuda.memory_allocated(0) / (1024**3)
        reserved = torch.cuda.memory_reserved(0) / (1024**3)
        free = total_memory - allocated

        memory_info.update({
            "gpu_name": props.name,
            "total_memory_gb": round(total_memory, 2),
            "allocated_memory_gb": round(allocated, 2),
            "reserved_memory_gb": round(reserved, 2),
            "free_memory_gb": round(free, 2),
            "utilization_percent": round((allocated / total_memory) * 100, 1),
        })

    return memory_info


@app.get("/debug")
async def debug_info():
    """Enhanced debug information"""
    model_device_info = {}
    if model:
        try:
            # Collect the device of every parameter so "device_consistent" is meaningful
            devices = {str(param.device) for param in model.parameters()}
            model_device_info = {
                "model_devices": sorted(devices),
                "device_consistent": len(devices) == 1,
                "first_param_device": str(next(model.parameters()).device),
            }
        except Exception:
            model_device_info = {"error": "Could not get model device info"}

    memory_info = await memory_status()

    return {
        "model_status": {
            "model_loaded": model_loaded,
            "loading_error": loading_error,
            "model_type": type(model).__name__ if model else None,
            **model_device_info,
        },
        "system_info": {
            "target_device": device,
            "gpu_available": torch.cuda.is_available(),
            "torch_version": torch.__version__,
            "cuda_version": torch.version.cuda if torch.cuda.is_available() else None,
        },
        "memory_info": memory_info,
        "optimization_info": {
            "precision": "FP16",
            "quantization": "None",
            "flash_attention": "Enabled",
            "tf32": "Enabled",
            "lora_merged": (not isinstance(model, PeftModel)) if model_loaded else "Unknown",
        },
        "cache_info": {
            "hf_cache": CACHE_DIR,
            "cache_exists": os.path.exists(CACHE_DIR),
            "cache_writable": os.access(CACHE_DIR, os.W_OK) if os.path.exists(CACHE_DIR) else False,
        },
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app:app", host="0.0.0.0", port=7860)
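
# Example request (sketch): assumes the server above is running locally on
# port 7860 and that the `requests` package is available on the client side.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={"message": "Merhaba doktor, başım ağrıyor.", "max_tokens": 150},
#       timeout=120,
#   )
#   print(resp.json()["response"])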