import os
import sys
import threading
import time
import zipfile
import random
from datetime import datetime

import torch
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, JSONResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import hf_hub_download
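
# Configuration: base model and fine-tuned artefact location on the Hugging Face Hub,
# plus the decoding mode and the confidence/fallback behaviour of the /chat endpoint.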
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_BASE = "UcsTurkey/kanarya-750m-fixed"
FINE_TUNE_ZIP = "trained_model_000_000.zip"
FINE_TUNE_REPO = "UcsTurkey/trained-zips"
CONFIDENCE_THRESHOLD = -1.5
USE_SAMPLING = False
FALLBACK_ANSWERS = [
    "Bu konuda maalesef bilgim yok.",
    "Ne demek istediğinizi tam anlayamadım.",
    "Bu soruya şu an yanıt veremiyorum.",
]
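

# Timestamped logger: falls back to a sanitized message on consoles that cannot
# encode UTF-8 and flushes stdout so messages show up promptly even when buffered.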
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    try:
        print(f"[{timestamp}] {message}")
    except UnicodeEncodeError:
        # Replace characters the active console encoding cannot handle instead of crashing.
        safe_message = message.encode("utf-8", errors="replace").decode("utf-8", errors="ignore")
        print(f"[{timestamp}] {safe_message}")
    sys.stdout.flush()
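

# Application state. `model` and `tokenizer` stay None until setup_model(),
# which runs in a background thread, has finished loading them.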
app = FastAPI()
chat_history = []
model = None
tokenizer = None


class Message(BaseModel):
    user_input: str
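

# Report the runtime environment: device, GPU name, and whether the GPU supports
# bfloat16 natively (compute capability 8.0 / Ampere or newer).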
def detect_environment():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    supports_bfloat16 = False
    gpu_name = "Yok"
    if device == "cuda":
        props = torch.cuda.get_device_properties(0)
        gpu_name = props.name
        major, _ = torch.cuda.get_device_capability(0)
        supports_bfloat16 = major >= 8
    return {
        "device": device,
        "gpu_name": gpu_name,
        "supports_bfloat16": supports_bfloat16,
        "expected_config": {
            "gpu": "Nvidia A100", "min_vram": "16GB", "cpu": "8 vCPU"
        },
    }
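

# Health check plus a status endpoint that reports the detected hardware
# alongside the minimum configuration the deployment expects.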
@app.get("/")
def health():
    return {"status": "ok"}


@app.get("/status")
def status():
    env = detect_environment()
    return {
        "device": env["device"],
        "gpu": env["gpu_name"],
        "supports_bfloat16": env["supports_bfloat16"],
        "expected_config": env["expected_config"],
        "note": "Sistem bu bilgilerle çalışıyor. bfloat16 desteklenmiyorsa performans sınırlı olabilir."
    }
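

# Minimal HTML test page: a textarea and a button that POSTs the input to /chat
# and prints the JSON answer (or error) below it.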
@app.get("/start", response_class=HTMLResponse)
def root():
    return """
    <html>
        <head><title>Fine-Tune Chat</title></head>
        <body>
            <h2>Fine-tune Chat Test</h2>
            <textarea id="input" rows="4" cols="60" placeholder="Bir şeyler yaz..."></textarea><br><br>
            <button onclick="send()">Gönder</button>
            <pre id="output"></pre>
            <script>
                async function send() {
                    const input = document.getElementById("input").value;
                    const res = await fetch('/chat', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ user_input: input })
                    });
                    const data = await res.json();
                    document.getElementById('output').innerText = data.answer || data.error || 'Hata oluştu.';
                }
            </script>
        </body>
    </html>
    """
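

# Chat endpoint: wraps the user input in a "SORU: ... / CEVAP:" prompt, generates up
# to 100 new tokens (greedy by default, sampling when USE_SAMPLING is True), and swaps
# in a canned fallback answer when the highest first-token logit is below
# CONFIDENCE_THRESHOLD. Example request, assuming the server is reachable locally on
# port 7860:
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"user_input": "Merhaba"}'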
@app.post("/chat")
def chat(msg: Message):
    try:
        log(f"Kullanıcı mesajı alındı: {msg}")
        global model, tokenizer
        if model is None or tokenizer is None:
            log("Hata: Model henüz yüklenmedi.")
            return {"error": "Model yüklenmedi. Lütfen birkaç saniye sonra tekrar deneyin."}

        user_input = msg.user_input.strip()
        if not user_input:
            return {"error": "Boş giriş"}

        full_prompt = f"SORU: {user_input}\nCEVAP:"
        log(f"Prompt: {full_prompt}")
        inputs = tokenizer(full_prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        log(f"Tokenizer input_ids: {inputs['input_ids']}")
        log(f"input shape: {inputs['input_ids'].shape}")

        # Shared generation settings; sampling parameters are only added when enabled.
        generation_kwargs = dict(
            max_new_tokens=100,
            do_sample=USE_SAMPLING,
            return_dict_in_generate=True,
            output_scores=True,
            # Explicit None check: a pad_token_id of 0 is falsy but still valid.
            suppress_tokens=[tokenizer.pad_token_id] if tokenizer.pad_token_id is not None else None,
        )
        if USE_SAMPLING:
            generation_kwargs.update(temperature=0.7, top_k=50, top_p=0.95)

        with torch.no_grad():
            output = model.generate(**inputs, **generation_kwargs)

        generated_ids = output.sequences[0]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        answer = generated_text[len(full_prompt):].strip()

        if output.scores and len(output.scores) > 0:
            first_token_logit = output.scores[0][0]
            if torch.isnan(first_token_logit).any() or torch.isinf(first_token_logit).any():
                log("Geçersiz logit (NaN/Inf) tespit edildi, fallback cevabı gönderiliyor.")
                return {"answer": random.choice(FALLBACK_ANSWERS), "chat_history": chat_history}
            top_logit_score = torch.max(first_token_logit).item()
            log(f"İlk token logit skoru: {top_logit_score:.4f}")
            if top_logit_score < CONFIDENCE_THRESHOLD:
                fallback = random.choice(FALLBACK_ANSWERS)
                log(f"Düşük güven: fallback cevabı gönderiliyor: {fallback}")
                answer = fallback

        chat_history.append({"user": user_input, "bot": answer})
        log(f"Soru: {user_input} → Yanıt: {answer[:60]}...")
        return {"answer": answer, "chat_history": chat_history}
    except Exception as e:
        log(f"/chat sırasında hata oluştu: {e}")
        return {"error": str(e)}
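

# Download the fine-tuned artefact from the Hub, extract it, and load the tokenizer,
# base model, and PEFT adapter on the detected device and precision. The zip is
# expected to contain an "output" directory with the tokenizer and adapter files.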
def setup_model():
    try:
        global model, tokenizer
        log("Fine-tune zip indiriliyor...")
        zip_path = hf_hub_download(
            repo_id=FINE_TUNE_REPO,
            filename=FINE_TUNE_ZIP,
            repo_type="model",
            token=HF_TOKEN
        )
        extract_dir = "/app/extracted"
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        log("Zip başarıyla açıldı.")

        tokenizer = AutoTokenizer.from_pretrained(os.path.join(extract_dir, "output"))
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        env = detect_environment()
        device = env["device"]
        dtype = torch.bfloat16 if env["supports_bfloat16"] else (torch.float16 if device == "cuda" else torch.float32)
        log(f"Ortam: GPU = {env['gpu_name']}, Device = {device}, bfloat16 destekleniyor mu: {env['supports_bfloat16']}")
        log(f"Model {device.upper()} üzerinde {dtype} precision ile yüklenecek.")
        log("Beklenen minimum sistem konfigürasyonu:")
        log(f"- GPU: {env['expected_config']['gpu']}")
        log(f"- GPU Bellek: {env['expected_config']['min_vram']}")
        log(f"- CPU: {env['expected_config']['cpu']}")

        base_model = AutoModelForCausalLM.from_pretrained(MODEL_BASE, torch_dtype=dtype).to(device)
        peft_model = PeftModel.from_pretrained(base_model, os.path.join(extract_dir, "output"))
        # Use the underlying transformers model (with the adapter weights applied) for generation.
        model = peft_model.model.to(device)
        model.eval()
        log(f"Model başarıyla yüklendi. dtype={next(model.parameters()).dtype}, device={next(model.parameters()).device}")
    except Exception as e:
        log(f"setup_model() sırasında hata oluştu: {e}")
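

# Startup: model loading and the Uvicorn server each run in a daemon thread, so the
# HTTP server can come up while the model is still downloading; the main thread only
# sleeps to keep the process alive.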
def run_server():
    log("Uvicorn sunucusu başlatılıyor...")
    uvicorn.run(app, host="0.0.0.0", port=7860)


log("===== Application Startup =====")
threading.Thread(target=setup_model, daemon=True).start()
threading.Thread(target=run_server, daemon=True).start()
log("Model yükleniyor, istekler ve API sunucusu hazırlanıyor...")

while True:
    try:
        time.sleep(60)
    except Exception as e:
        log(f"Ana bekleme döngüsünde hata: {e}")