"""Minimal FastAPI service exposing text generation from a DeepSeek R1 distill model.

Routes:
    POST /generate -- generate a completion for a prompt.
    GET  /        -- liveness check.
"""

from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI(title="DeepSeek R1 Distill Llama API")

# Load model and tokenizer once at startup (blocking; downloads on first run).
model_name = "ai/deepseek-r1-distill-llama"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision to fit the model in GPU memory
    device_map="auto",          # let accelerate place layers on available devices
)


class Request(BaseModel):
    """Request body for /generate."""

    prompt: str
    # Number of NEW tokens to generate (not total sequence length).
    max_length: int = 100


@app.post("/generate")
def generate_text(request: Request):
    """Generate a completion for ``request.prompt``.

    Returns:
        dict with key ``generated_text`` holding the decoded output
        (prompt followed by the continuation).
    """
    inputs = tokenizer(request.prompt, return_tensors="pt").to(model.device)
    # inference_mode: no autograd graph is needed for generation.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            # BUGFIX: the original passed max_length, which caps the TOTAL
            # sequence (prompt + generation) — a long prompt left no budget
            # to generate. max_new_tokens bounds only the continuation.
            max_new_tokens=request.max_length,
            # Llama-family tokenizers usually define no pad token; fall back
            # to EOS to avoid a per-request warning. NOTE(review): confirm
            # against this checkpoint's tokenizer config.
            pad_token_id=tokenizer.eos_token_id,
        )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": generated_text}


@app.get("/")
def home():
    """Liveness probe."""
    return {"message": "DeepSeek R1 API is running!"}