from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import numpy as np
import logging

# NumPy version check: this stack expects NumPy 1.x
assert np.__version__.startswith('1.'), f"Incompatible NumPy version: {np.__version__}"

# Configure logging before it is first used
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

class RequestData(BaseModel):
    prompt: str
    max_tokens: int = 50

#MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#MODEL_NAME = "ai-forever/rugpt3small_based_on_gpt2"
MODEL_NAME = "TinyLlama/TinyLlama_v1.1"

try:
    # Load the model with an explicit device_map so weights are placed automatically
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    # Build the pipeline without a device argument: the model is already
    # placed on devices by device_map, and passing device here would conflict
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )
except Exception as e:
    logger.error(f"Model loading failed: {e}")
    generator = None

@app.on_event("startup")
async def startup_event():
    routes = [route.path for route in app.routes]
    logger.info(f"Registered routes: {routes}")

@app.get("/")
async def root_health_check():
    return {"status": "ok"}

@app.post("/generate")
async def generate_text(request: RequestData):
    if not generator:
        raise HTTPException(status_code=503, detail="Model is not loaded")
    try:
        # Greedy decoding: do_sample=False with num_beams=1; temperature is
        # omitted because it is ignored (and warned about) when not sampling
        output = generator(
            request.prompt,
            max_new_tokens=min(request.max_tokens, 100),
            do_sample=False,
            num_beams=1,
        )
        return {"response": output[0]["generated_text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    return {"status": "ok" if generator else "unavailable"}
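
# --- Usage sketch (assumptions: this file is saved as main.py and uvicorn is
# installed; the module name, host, and port below are illustrative, not part
# of the original code) ---
#
# Start the server:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Check readiness (returns {"status": "ok"} once the model has loaded):
#   curl http://127.0.0.1:8000/health
#
# Generate text (max_tokens is capped at 100 server-side):
#   curl -X POST http://127.0.0.1:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello", "max_tokens": 50}'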