|
|
|
""" |
|
Turkish Medical Model API - No Flash Attention Dependency |
|
Focus on LoRA loading with compiler support |
|
""" |
|
|
|
import os |
|
import shutil |
|
import logging |
|
import time |
|
import asyncio |
|
import gc |
|
from typing import Dict, Optional, List |
|
import json |
|
|
|
|
|
CACHE_DIR = "/tmp/hf_cache" |
|
TRITON_CACHE = "/tmp/triton_cache" |
|
|
|
|
|
os.environ["CC"] = "gcc" |
|
os.environ["CXX"] = "g++" |
|
|
|
|
|
os.environ["HF_HOME"] = CACHE_DIR |
|
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR |
|
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR |
|
os.environ["HF_HUB_CACHE"] = CACHE_DIR |
|
os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE |
|
os.environ["CUDA_CACHE_PATH"] = "/tmp/cuda_cache" |
|
|
|
|
|
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" |
|
os.environ["CUDA_LAUNCH_BLOCKING"] = "0" |
|
|
|
|
|
os.environ["MAX_JOBS"] = "4" |
|
|
|
|
|
os.environ["TORCH_COMPILE"] = "0" |
|
os.environ["PYTORCH_COMPILE"] = "0" |
|
|
|
|
|
for cache_path in [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache"]: |
|
os.makedirs(cache_path, exist_ok=True) |
|
os.chmod(cache_path, 0o777) |
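
# NOTE: the cache and compiler environment variables above are set before the
# torch / transformers / peft imports below on purpose: several of them are
# read when those libraries initialize, so the ordering matters.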
|
|
|
import torch |
|
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig |
|
from peft import PeftModel |
|
from fastapi import FastAPI, HTTPException |
|
from fastapi.middleware.cors import CORSMiddleware |
|
from pydantic import BaseModel |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
HF_TOKEN = os.getenv("HF_TOKEN") |
|
|
|
|
|
tokenizer = None |
|
model = None |
|
generation_config = None |
|
model_loaded = False |
|
loading_error = None |
|
device = "cuda:0" |
|
lora_loaded = False |
|
|
|
|
|
class Message(BaseModel): |
|
role: str |
|
content: str |
|
|
|
class ChatRequest(BaseModel): |
|
message: str |
|
max_tokens: int = 200 |
|
temperature: float = 0.7 |
|
conversation_history: Optional[List[Message]] = [] |
|
|
|
class ConversationRequest(BaseModel): |
|
messages: List[Message] |
|
max_tokens: int = 200 |
|
temperature: float = 0.7 |
|
|
|
class ChatResponse(BaseModel): |
|
response: str |
|
generation_time: float |
|
tokens_generated: int |
|
conversation_turn: int |
|
|
|
class HealthResponse(BaseModel): |
|
status: str |
|
model_loaded: bool |
|
gpu_available: bool |
|
error: Optional[str] = None |
|
|
|
def check_compiler(): |
|
"""Check if C compiler is available""" |
|
try: |
|
import subprocess |
|
result = subprocess.run(['gcc', '--version'], capture_output=True, text=True) |
|
if result.returncode == 0: |
|
logger.info("✅ GCC compiler found") |
|
logger.info(f"🔧 GCC version: {result.stdout.split()[2]}") |
|
return True |
|
else: |
|
logger.error("❌ GCC compiler not found") |
|
return False |
|
except Exception as e: |
|
logger.error(f"❌ Compiler check failed: {e}") |
|
return False |
|
|
|
def setup_l4_optimization(): |
|
"""Setup optimizations specific to Nvidia L4 (without Flash Attention)""" |
|
if torch.cuda.is_available(): |
|
|
|
torch.cuda.set_per_process_memory_fraction(0.85) |
|
torch.backends.cuda.matmul.allow_tf32 = True |
|
torch.backends.cudnn.allow_tf32 = True |
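        # TF32 keeps FP32 dynamic range but uses reduced-precision matmuls;
        # on Ampere/Ada GPUs such as the L4 this is usually a safe speed win
        # for inference.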
|
|
|
logger.info("🎯 L4 optimizations enabled: TF32, Memory optimized") |
|
|
|
def clear_gpu_memory(): |
|
"""Optimized GPU memory cleanup for L4""" |
|
if torch.cuda.is_available(): |
|
torch.cuda.empty_cache() |
|
torch.cuda.synchronize() |
|
gc.collect() |
|
|
|
def setup_cache_directories(): |
|
"""Setup cache directories""" |
|
cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"] |
|
|
|
for cache_dir in cache_dirs: |
|
try: |
|
os.makedirs(cache_dir, exist_ok=True) |
|
os.chmod(cache_dir, 0o777) |
|
logger.info(f"✅ Created cache dir: {cache_dir}") |
|
except Exception as e: |
|
logger.warning(f"⚠️ Could not create {cache_dir}: {e}") |
|
|
|
def clear_cache_locks(): |
|
"""Clear cache locks""" |
|
try: |
|
all_cache_dirs = [CACHE_DIR, TRITON_CACHE, "/tmp/cuda_cache", "/tmp/.cache"] |
|
|
|
for cache_dir in all_cache_dirs: |
|
if os.path.exists(cache_dir): |
|
for root, dirs, files in os.walk(cache_dir): |
|
for file in files: |
|
if file.endswith('.lock') or file.endswith('.incomplete'): |
|
lock_file = os.path.join(root, file) |
|
try: |
|
os.remove(lock_file) |
|
                            except OSError:
|
pass |
|
except Exception as e: |
|
logger.warning(f"Could not clear cache locks: {e}") |
|
|
|
def format_medical_conversation(messages: List[Message]) -> str: |
|
"""Format conversation for Turkish medical context""" |
|
conversation = "Bu bir Türkçe hasta-doktor görüşmesidir. Doktor profesyonel, empatik ve tıbbi bilgiye dayalı yanıtlar verir.\n\n" |
|
|
|
for i, msg in enumerate(messages): |
|
if msg.role == "assistant": |
|
conversation += f"Doktor: {msg.content}\n" |
|
else: |
|
conversation += f"Hasta: {msg.content}\n" |
|
|
|
conversation += "Doktor:" |
|
return conversation |
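
# Illustrative example (not executed): for a single user message
# "Başım ağrıyor", the formatted prompt looks like:
#
#   Bu bir Türkçe hasta-doktor görüşmesidir. Doktor profesyonel, empatik ve
#   tıbbi bilgiye dayalı yanıtlar verir.
#
#   Hasta: Başım ağrıyor
#   Doktor: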
|
|
|
def clean_medical_response(response: str) -> str: |
|
"""Clean and validate Turkish medical response""" |
|
|
|
response = response.strip() |
|
|
|
|
|
prefixes_to_remove = ["Doktor:", "Hasta:", "Assistant:", "Human:", "Dr.", "Patient:"] |
|
for prefix in prefixes_to_remove: |
|
if response.startswith(prefix): |
|
response = response[len(prefix):].strip() |
|
|
|
|
|
unwanted_patterns = [ |
|
"Hasta :", "Hasta:", "HASTA:", "Dokтор:", "DOKTOR:", "DİĞER HASTA", |
|
"DOKTÖR:", "(gülmeye başlıyor)", "(kıkırdayarak)", "arkada", "arkadan" |
|
] |
|
|
|
for pattern in unwanted_patterns: |
|
response = response.replace(pattern, "") |
|
|
|
|
|
sentences = response.split('.') |
|
clean_sentences = [] |
|
|
|
for sentence in sentences: |
|
sentence = sentence.strip() |
|
if (len(sentence) > 15 and |
|
not any(bad_word in sentence.lower() for bad_word in ["hasta", "gülme", "kıkırd", "arkada"])): |
|
clean_sentences.append(sentence) |
|
if len(clean_sentences) >= 2: |
|
break |
|
|
|
if clean_sentences: |
|
response = '. '.join(clean_sentences) |
|
if not response.endswith('.'): |
|
response += '.' |
|
else: |
|
response = "Bu konuda size yardımcı olmaya çalışayım. Lütfen belirtilerinizi daha detaylı anlatabilir misiniz?" |
|
|
|
return response |
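
# Illustrative example (not executed): a raw generation such as
#   "Doktor: Geçmiş olsun, ağrınız ne zamandır sürüyor? (gülmeye başlıyor)"
# has the "Doktor:" prefix and the parenthetical stage direction removed, and
# at most the first two sufficiently long clean sentences are kept; if nothing
# survives the filter, the fixed fallback question is returned instead.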
|
|
|
async def load_model(): |
|
"""Load model with focus on LoRA compilation""" |
|
global tokenizer, model, generation_config, model_loaded, loading_error, lora_loaded |
|
|
|
if model_loaded: |
|
return True |
|
|
|
try: |
|
logger.info("🚀 Loading Turkish Medical Model - LoRA Focus...") |
|
|
|
|
|
compiler_available = check_compiler() |
|
if not compiler_available: |
|
logger.warning("⚠️ C compiler not available, LoRA compilation may fail") |
|
|
|
setup_cache_directories() |
|
clear_cache_locks() |
|
setup_l4_optimization() |
|
clear_gpu_memory() |
|
|
|
start_time = time.time() |
|
|
|
|
|
if torch.cuda.is_available(): |
|
props = torch.cuda.get_device_properties(0) |
|
total_memory = props.total_memory / (1024**3) |
|
logger.info(f"🎮 GPU: {props.name}") |
|
logger.info(f"🎮 Total VRAM: {total_memory:.1f}GB") |
|
|
|
|
|
logger.info("📚 Loading tokenizer...") |
|
tokenizer = AutoTokenizer.from_pretrained( |
|
"Conquerorr000/llama-3.1-8b-turkish-medical-lora", |
|
cache_dir=CACHE_DIR, |
|
trust_remote_code=True, |
|
token=HF_TOKEN, |
|
use_fast=True |
|
) |
|
|
|
if tokenizer.pad_token is None: |
|
tokenizer.pad_token = tokenizer.eos_token |
|
tokenizer.pad_token_id = tokenizer.eos_token_id |
|
|
|
logger.info("✅ Tokenizer loaded successfully") |
|
|
|
|
|
logger.info("🧠 Loading base model (FP16 - optimized for LoRA)...") |
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
"meta-llama/Meta-Llama-3.1-8B-Instruct", |
|
cache_dir=CACHE_DIR, |
|
torch_dtype=torch.float16, |
|
device_map="auto", |
|
trust_remote_code=True, |
|
low_cpu_mem_usage=True, |
|
token=HF_TOKEN, |
|
attn_implementation="eager", |
|
use_cache=True, |
|
max_memory={0: "18GiB"} |
|
) |
|
|
|
logger.info("✅ Base model loaded (eager attention)") |
|
|
|
|
|
if torch.cuda.is_available(): |
|
allocated = torch.cuda.memory_allocated(0) / (1024**3) |
|
logger.info(f"🎮 Memory after base model: {allocated:.2f}GB") |
|
|
|
|
|
logger.info("🎯 Loading Turkish Medical LoRA adapter...") |
|
|
|
try: |
|
|
|
logger.info("🔧 Attempting LoRA compilation...") |
|
|
|
lora_model = PeftModel.from_pretrained( |
|
model, |
|
"Conquerorr000/llama-3.1-8b-turkish-medical-lora", |
|
cache_dir=CACHE_DIR, |
|
torch_dtype=torch.float16, |
|
token=HF_TOKEN, |
|
is_trainable=False, |
|
device_map="auto" |
|
) |
|
|
|
logger.info("✅ Turkish Medical LoRA adapter loaded successfully!") |
|
lora_loaded = True |
|
|
|
|
|
logger.info("🔗 Attempting to merge LoRA adapter...") |
|
try: |
|
|
|
if torch.cuda.is_available(): |
|
free_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0) |
|
free_gb = free_memory / (1024**3) |
|
logger.info(f"🎮 Free memory for merge: {free_gb:.2f}GB") |
|
|
|
if free_gb > 3.0: |
|
model = lora_model.merge_and_unload() |
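                        # Merging folds the LoRA deltas into the base weights, so
                        # inference no longer pays the adapter overhead; the merge
                        # itself needs extra memory, hence the free-memory check above.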
|
logger.info("✅ Turkish Medical LoRA merged successfully!") |
|
else: |
|
logger.info("📝 Using LoRA as adapter (insufficient memory for merge)") |
|
model = lora_model |
|
|
|
except Exception as merge_error: |
|
logger.warning(f"⚠️ LoRA merge failed: {merge_error}") |
|
logger.info("📝 Using Turkish Medical LoRA as adapter") |
|
model = lora_model |
|
|
|
except Exception as lora_error: |
|
logger.error(f"❌ Turkish Medical LoRA loading failed: {lora_error}") |
|
|
|
|
|
logger.info("🔄 Trying alternative LoRA loading...") |
|
|
|
try: |
|
|
|
lora_model = PeftModel.from_pretrained( |
|
model, |
|
"Conquerorr000/llama-3.1-8b-turkish-medical-lora", |
|
cache_dir=CACHE_DIR, |
|
torch_dtype=torch.float16, |
|
token=HF_TOKEN, |
|
is_trainable=False, |
|
device_map=None |
|
) |
|
|
|
|
|
lora_model = lora_model.to(device) |
|
model = lora_model |
|
lora_loaded = True |
|
logger.info("✅ Turkish Medical LoRA loaded with alternative method!") |
|
|
|
except Exception as alt_error: |
|
logger.error(f"❌ Alternative LoRA loading also failed: {alt_error}") |
|
logger.error("❌ CRITICAL: Model will not have Turkish medical fine-tuning!") |
|
loading_error = f"LoRA loading failed: {str(lora_error)}" |
|
lora_loaded = False |
|
|
|
|
|
generation_config = GenerationConfig( |
|
max_new_tokens=150, |
|
temperature=0.7, |
|
top_p=0.9, |
|
top_k=50, |
|
do_sample=True, |
|
pad_token_id=tokenizer.pad_token_id, |
|
eos_token_id=tokenizer.eos_token_id, |
|
repetition_penalty=1.15, |
|
no_repeat_ngram_size=3, |
|
use_cache=True |
|
) |
|
|
|
|
|
model.eval() |
|
|
|
|
|
clear_gpu_memory() |
|
|
|
loading_time = time.time() - start_time |
|
logger.info(f"✅ Model loading completed in {loading_time:.2f}s") |
|
|
|
|
|
if torch.cuda.is_available(): |
|
total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3) |
|
allocated = torch.cuda.memory_allocated(0) / (1024**3) |
|
free = total_memory - allocated |
|
logger.info(f"🎮 Final Memory: Allocated={allocated:.2f}GB, Free={free:.2f}GB") |
|
|
|
status = "✅ TURKISH MEDICAL MODEL READY" if lora_loaded else "⚠️ BASE MODEL ONLY (NO MEDICAL TRAINING)" |
|
logger.info(status) |
|
|
|
model_loaded = True |
|
return True |
|
|
|
except Exception as e: |
|
error_msg = f"Model loading failed: {str(e)}" |
|
logger.error(f"❌ {error_msg}") |
|
loading_error = error_msg |
|
model_loaded = False |
|
clear_gpu_memory() |
|
return False |
|
|
|
async def generate_response(messages: List[Message], max_tokens: int = 200, temperature: float = 0.7) -> Dict: |
|
"""Generate Turkish medical response""" |
|
global model, tokenizer, generation_config |
|
|
|
if not model_loaded: |
|
raise HTTPException(status_code=503, detail="Model not loaded") |
|
|
|
try: |
|
start_time = time.time() |
|
|
|
|
|
conversation_text = format_medical_conversation(messages) |
|
|
|
|
|
inputs = tokenizer( |
|
conversation_text, |
|
return_tensors="pt", |
|
padding=True, |
|
truncation=True, |
|
max_length=1024, |
|
add_special_tokens=True |
|
) |
|
|
|
|
|
inputs = {k: v.to(device) for k, v in inputs.items()} |
|
|
|
|
|
gen_config = GenerationConfig( |
|
max_new_tokens=min(max_tokens, 150), |
|
temperature=temperature, |
|
top_p=0.9, |
|
top_k=50, |
|
do_sample=True, |
|
pad_token_id=tokenizer.pad_token_id, |
|
eos_token_id=tokenizer.eos_token_id, |
|
repetition_penalty=1.15, |
|
no_repeat_ngram_size=3, |
|
use_cache=True |
|
) |
|
|
|
with torch.no_grad(): |
|
outputs = model.generate( |
|
input_ids=inputs["input_ids"], |
|
attention_mask=inputs["attention_mask"], |
|
generation_config=gen_config, |
|
use_cache=True |
|
) |
|
|
|
|
|
input_length = inputs["input_ids"].shape[1] |
|
generated_ids = outputs[0][input_length:] |
|
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) |
|
|
|
|
|
response = clean_medical_response(generated_text) |
|
|
|
generation_time = time.time() - start_time |
|
|
|
|
|
        tokens_generated = int(generated_ids.shape[0])  # count tokens, not whitespace-split words
        del outputs, generated_ids
|
torch.cuda.empty_cache() |
|
|
|
return { |
|
"response": response, |
|
"generation_time": round(generation_time, 3), |
|
"tokens_generated": len(generated_text.split()), |
|
"conversation_turn": len(messages) + 1, |
|
"lora_active": lora_loaded |
|
} |
|
|
|
except Exception as e: |
|
logger.error(f"Generation error: {e}") |
|
torch.cuda.empty_cache() |
|
raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}") |
|
|
|
|
|
app = FastAPI( |
|
title="Turkish Medical Model API", |
|
description="Turkish medical conversation model - Stable build", |
|
version="2.1.1" |
|
) |
|
|
|
app.add_middleware( |
|
CORSMiddleware, |
|
allow_origins=["*"], |
|
allow_credentials=True, |
|
allow_methods=["*"], |
|
allow_headers=["*"], |
|
) |
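
# NOTE: allow_origins=["*"] combined with allow_credentials=True is maximally
# permissive; restrict the origin list to the actual frontend domains for a
# production deployment.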
|
|
|
@app.on_event("startup") |
|
async def startup_event(): |
|
"""Load model on startup""" |
|
logger.info("🚀 Starting API server - Stable build...") |
|
logger.info(f"📁 HF Cache: {CACHE_DIR}") |
|
logger.info(f"🎮 Target GPU: Nvidia L4 24GB") |
|
logger.info(f"💾 Mode: FP16 + Turkish Medical LoRA") |
|
logger.info(f"🔧 Compiler: {os.environ.get('CC', 'Not Set')}") |
|
logger.info(f"⚡ Attention: Eager (stable)") |
|
|
|
if HF_TOKEN: |
|
logger.info("✅ HF Token found") |
|
else: |
|
logger.info("ℹ️ No HF Token") |
|
|
|
setup_cache_directories() |
|
clear_cache_locks() |
|
setup_l4_optimization() |
|
|
|
|
|
asyncio.create_task(load_model()) |
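    # Model loading runs in the background: until load_model() completes,
    # /chat and /conversation respond with 503 and /health reports "loading".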
|
|
|
@app.get("/", response_model=HealthResponse) |
|
async def root(): |
|
return HealthResponse( |
|
status="healthy" if model_loaded else "loading", |
|
model_loaded=model_loaded, |
|
gpu_available=torch.cuda.is_available(), |
|
error=loading_error |
|
) |
|
|
|
@app.get("/health", response_model=HealthResponse) |
|
async def health_check(): |
|
return HealthResponse( |
|
status="healthy" if (model_loaded and lora_loaded) else "degraded" if model_loaded else "loading", |
|
model_loaded=model_loaded, |
|
gpu_available=torch.cuda.is_available(), |
|
error=loading_error |
|
) |
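
# /health distinguishes three states: "healthy" (base model + Turkish medical
# LoRA loaded), "degraded" (base model only, no medical fine-tuning), and
# "loading" (startup still in progress).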
|
|
|
@app.post("/chat", response_model=ChatResponse) |
|
async def chat_endpoint(request: ChatRequest): |
|
"""Turkish medical chat endpoint""" |
|
try: |
|
        messages = list(request.conversation_history or [])  # copy so the request object is not mutated
|
messages.append(Message(role="user", content=request.message)) |
|
|
|
result = await generate_response( |
|
messages, |
|
request.max_tokens, |
|
request.temperature |
|
) |
|
|
|
return ChatResponse( |
|
response=result["response"], |
|
generation_time=result["generation_time"], |
|
tokens_generated=result["tokens_generated"], |
|
conversation_turn=result["conversation_turn"] |
|
) |
|
|
|
    except HTTPException:
        raise
    except Exception as e:
|
logger.error(f"Chat error: {e}") |
|
raise HTTPException(status_code=500, detail=f"Chat failed: {str(e)}") |
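
# Example request (illustrative; assumes the server listens on port 7860 as in
# the __main__ block below):
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Merhaba doktor, başım ağrıyor.", "max_tokens": 150, "temperature": 0.7}'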
|
|
|
@app.post("/conversation", response_model=ChatResponse) |
|
async def conversation_endpoint(request: ConversationRequest): |
|
"""Turkish medical conversation endpoint""" |
|
try: |
|
result = await generate_response( |
|
request.messages, |
|
request.max_tokens, |
|
request.temperature |
|
) |
|
|
|
return ChatResponse( |
|
response=result["response"], |
|
generation_time=result["generation_time"], |
|
tokens_generated=result["tokens_generated"], |
|
conversation_turn=result["conversation_turn"] |
|
) |
|
|
|
    except HTTPException:
        raise
    except Exception as e:
|
logger.error(f"Conversation error: {e}") |
|
raise HTTPException(status_code=500, detail=f"Conversation failed: {str(e)}") |
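
# Example request (illustrative): /conversation takes the full message history
# instead of a single message, e.g.
#
#   curl -X POST http://localhost:7860/conversation \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "İki gündür ateşim var."}], "max_tokens": 150}'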
|
|
|
@app.get("/test") |
|
async def test_endpoint(): |
|
"""Turkish medical test""" |
|
if not model_loaded: |
|
return { |
|
"status": "model_not_ready", |
|
"message": "Model is still loading...", |
|
"error": loading_error |
|
} |
|
|
|
try: |
|
test_messages = [ |
|
Message(role="user", content="Merhaba doktor, 2 gündür başım ağrıyor ve ateşim var.") |
|
] |
|
|
|
result = await generate_response(test_messages, 150, 0.7) |
|
|
|
return { |
|
"status": "success", |
|
"test_input": test_messages[0].content, |
|
"test_output": result["response"], |
|
"generation_time": result["generation_time"], |
|
"device_info": device, |
|
"lora_active": result.get("lora_active", False), |
|
"model_type": "Turkish Medical LoRA" if lora_loaded else "Base Llama (NO MEDICAL TRAINING)" |
|
} |
|
except Exception as e: |
|
logger.error(f"Test error: {e}") |
|
return { |
|
"status": "error", |
|
"message": f"Test failed: {str(e)}" |
|
} |
|
|
|
@app.get("/memory-status") |
|
async def memory_status(): |
|
"""Get GPU memory status""" |
|
memory_info = {"gpu_available": torch.cuda.is_available()} |
|
|
|
if torch.cuda.is_available(): |
|
props = torch.cuda.get_device_properties(0) |
|
total_memory = props.total_memory / (1024**3) |
|
allocated = torch.cuda.memory_allocated(0) / (1024**3) |
|
reserved = torch.cuda.memory_reserved(0) / (1024**3) |
|
free = total_memory - allocated |
|
|
|
memory_info.update({ |
|
"gpu_name": props.name, |
|
"total_memory_gb": round(total_memory, 2), |
|
"allocated_memory_gb": round(allocated, 2), |
|
"reserved_memory_gb": round(reserved, 2), |
|
"free_memory_gb": round(free, 2), |
|
"utilization_percent": round((allocated / total_memory) * 100, 1) |
|
}) |
|
|
|
return memory_info |
|
|
|
@app.get("/debug") |
|
async def debug_info(): |
|
"""Enhanced debug information""" |
|
model_device_info = {} |
|
if model: |
|
try: |
|
            devices = {str(p.device) for p in model.parameters()}
|
|
|
model_device_info = { |
|
"model_devices": list(devices), |
|
"device_consistent": len(devices) == 1, |
|
"first_param_device": str(next(model.parameters()).device) |
|
} |
|
        except Exception:
|
model_device_info = {"error": "Could not get model device info"} |
|
|
|
memory_info = await memory_status() |
|
|
|
return { |
|
"model_status": { |
|
"model_loaded": model_loaded, |
|
"lora_loaded": lora_loaded, |
|
"loading_error": loading_error, |
|
"model_type": type(model).__name__ if model else None, |
|
**model_device_info |
|
}, |
|
"system_info": { |
|
"target_device": device, |
|
"gpu_available": torch.cuda.is_available(), |
|
"torch_version": torch.__version__, |
|
"cuda_version": torch.version.cuda if torch.cuda.is_available() else None, |
|
"compiler": os.environ.get("CC", "Not Set") |
|
}, |
|
"memory_info": memory_info, |
|
"optimization_info": { |
|
"precision": "FP16", |
|
"quantization": "None", |
|
"flash_attention": "Disabled (stable build)", |
|
"tf32": "Enabled", |
|
"lora_status": "Loaded" if lora_loaded else "FAILED - NO MEDICAL TRAINING", |
|
"medical_fine_tuning": "Active" if lora_loaded else "MISSING" |
|
}, |
|
"cache_info": { |
|
"hf_cache": CACHE_DIR, |
|
"cache_exists": os.path.exists(CACHE_DIR), |
|
"cache_writable": os.access(CACHE_DIR, os.W_OK) if os.path.exists(CACHE_DIR) else False |
|
} |
|
} |
|
|
|
if __name__ == "__main__": |
|
import uvicorn |
|
uvicorn.run("app:app", host="0.0.0.0", port=7860) |
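
# To run with the uvicorn CLI instead (illustrative; assumes this file is saved
# as app.py to match the "app:app" import string above):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# HF_TOKEN should be set in the environment, since the Meta Llama 3.1 weights
# are gated on the Hugging Face Hub.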