#!/usr/bin/env python3
"""
DeepCoder Model API Server
Serves the DeepCoder-14B model via FastAPI
"""
import os
import asyncio
import logging
from typing import Optional, Dict, Any

import uvicorn
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
import json
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
MODEL_VARIANT = os.getenv("MODEL_VARIANT", "14B-Q4_K_M")
CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")
MAX_TOKENS = 131072  # 131K context length
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
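# Example of overriding the defaults above at launch time (a sketch; the variable
# names match the os.getenv() calls, and the port matches the __main__ block below):
#
#   MODEL_NAME=ai/deepcoder-preview MODEL_VARIANT=14B-Q4_K_M \
#       uvicorn app:app --host 0.0.0.0 --port 8000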
app = FastAPI(
    title="DeepCoder API",
    description="AI Code Generation Model API",
    version="1.0.0"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Global model variables
tokenizer = None
model = None
model_loaded = False
class CodeRequest(BaseModel):
    prompt: str = Field(..., description="Code generation prompt")
    temperature: float = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    top_p: float = Field(0.95, ge=0.0, le=1.0, description="Top-p sampling")
    max_tokens: int = Field(2048, ge=1, le=8192, description="Maximum tokens to generate")
    stop_sequences: Optional[list] = Field(None, description="Stop sequences")


class CodeResponse(BaseModel):
    generated_code: str
    model_info: Dict[str, Any]
    generation_params: Dict[str, Any]
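# A request body that CodeRequest accepts looks roughly like this (the prompt text
# is illustrative; only the field names and defaults come from the model above):
#
#   {
#       "prompt": "Write a Python function that reverses a string",
#       "temperature": 0.6,
#       "top_p": 0.95,
#       "max_tokens": 512
#   }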
async def load_model():
    """Load the DeepCoder model and tokenizer"""
    global tokenizer, model, model_loaded

    if model_loaded:
        return

    try:
        logger.info(f"Loading model: {MODEL_NAME}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True
        )

        # Load model with appropriate settings for the quantized version.
        # 4-bit loading requires bitsandbytes and a CUDA device, so it is only
        # enabled when a Q4 variant is requested and a GPU is available.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True,
            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
            device_map="auto" if DEVICE == "cuda" else None,
            load_in_4bit=("Q4" in MODEL_VARIANT) and DEVICE == "cuda",
        )

        if DEVICE == "cpu" and hasattr(model, 'to'):
            model = model.to(DEVICE)

        model_loaded = True
        logger.info(f"Model loaded successfully on {DEVICE}")
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise
# FastAPI event/route registrations. The handlers below read as API endpoints;
# the decorators (and their paths) are restored here as reasonable assumptions.
@app.on_event("startup")
async def startup_event():
    """Load model on startup"""
    await load_model()
@app.get("/")
async def root():
    return {
        "message": "DeepCoder API",
        "model": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "status": "ready" if model_loaded else "loading"
    }
@app.get("/health")
async def health_check():
    return {
        "status": "healthy" if model_loaded else "loading",
        "model_loaded": model_loaded,
        "device": DEVICE,
        "gpu_available": torch.cuda.is_available()
    }
@app.get("/model-info")
async def model_info():
    """Get model information"""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    return {
        "model_name": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "max_context_length": MAX_TOKENS,
        "device": DEVICE,
        "model_size": "14B parameters",
        "quantization": "Q4_K_M" if "Q4" in MODEL_VARIANT else "None",
        "benchmarks": {
            "LiveCodeBench_v5_Pass@1": "60.6%",
            "Codeforces_Elo": 1936,
            "Codeforces_Percentile": "95.3",
            "HumanEval+_Accuracy": "92.6%"
        }
    }
@app.post("/generate")
async def generate_code(request: CodeRequest):
    """Generate code using the DeepCoder model"""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    try:
        # Tokenize input, leaving room in the context window for the new tokens
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_TOKENS - request.max_tokens
        )
        if DEVICE == "cuda":
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        # Generation parameters
        generation_kwargs = {
            "max_new_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_p": request.top_p,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }

        # Generate
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        generated_code = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # model.generate() has no stop_sequences argument, so stop sequences are
        # applied here by truncating the decoded text at the first match.
        if request.stop_sequences:
            for stop in request.stop_sequences:
                idx = generated_code.find(stop)
                if idx != -1:
                    generated_code = generated_code[:idx]

        return CodeResponse(
            generated_code=generated_code,
            model_info={
                "model_name": MODEL_NAME,
                "variant": MODEL_VARIANT,
                "device": DEVICE
            },
            generation_params={
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens
            }
        )
    except Exception as e:
        logger.error(f"Generation error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
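# Example call against the /generate route registered above (a sketch; assumes the
# server is running locally on the port configured in the __main__ block):
#
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Write a binary search in Python", "max_tokens": 256}'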
@app.post("/chat")
async def chat_completion(request: CodeRequest):
    """Chat-style completion for code assistance"""
    # Add system context for better code generation
    system_prompt = """You are DeepCoder, an expert AI programming assistant. Generate high-quality, well-commented code that follows best practices."""
    full_prompt = f"{system_prompt}\n\nUser: {request.prompt}\n\nAssistant:"

    # Create modified request with system prompt
    modified_request = CodeRequest(
        prompt=full_prompt,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        stop_sequences=request.stop_sequences
    )
    return await generate_code(modified_request)
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=False,
        log_level="info"
    )
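# Minimal Python client sketch for the /chat route above (assumes the `requests`
# package is installed and the server is reachable at localhost:8000):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/chat",
#       json={"prompt": "Explain and fix this off-by-one error: range(len(xs) - 1)"},
#   )
#   print(resp.json()["generated_code"])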