from fastapi import FastAPI, HTTPException, Request, Response
from fastapi.responses import PlainTextResponse, JSONResponse
from pydantic import BaseModel
from typing import List, Optional
from llama_cpp import Llama
from starlette.middleware.base import BaseHTTPMiddleware
import logging
import os
import uuid

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("api_logger")


class LoggingMiddleware(BaseHTTPMiddleware):
    async def dispatch(self, request: Request, call_next):
        # Read the request body (must be buffered manually so it can be logged
        # and still consumed by the downstream handler)
        body = await request.body()
        logger.info(f"REQUEST: {request.method} {request.url}\nBody: {body.decode('utf-8')}")

        # Rebuild the request with the buffered body for downstream handlers.
        # The receive callable must be async, since Starlette awaits it.
        async def receive():
            return {"type": "http.request", "body": body}

        request = Request(request.scope, receive=receive)

        # Process the request and buffer the streamed response body
        response = await call_next(request)
        response_body = b""
        async for chunk in response.body_iterator:
            response_body += chunk

        # Log response status code and body
        logger.info(f"RESPONSE: Status {response.status_code}\nBody: {response_body.decode('utf-8')}")

        # Rebuild the response, since its body iterator has been consumed
        return Response(
            content=response_body,
            status_code=response.status_code,
            headers=dict(response.headers),
            media_type=response.media_type,
        )


# FastAPI app with logging middleware
app = FastAPI()
app.add_middleware(LoggingMiddleware)

llm = None


# Request/response models
class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 256


class GenerateRequest(BaseModel):
    model: str
    prompt: str
    max_tokens: Optional[int] = 256
    temperature: Optional[float] = 0.7


class ModelInfo(BaseModel):
    id: str
    object: str
    type: str
    publisher: str
    arch: str
    compatibility_type: str
    quantization: str
    state: str
    max_context_length: int


AVAILABLE_MODELS = [
    ModelInfo(
        id="codellama-7b-instruct",
        object="model",
        type="llm",
        publisher="lmstudio-community",
        arch="llama",
        compatibility_type="gguf",
        quantization="Q4_K_M",
        state="loaded",
        max_context_length=32768,
    )
]


@app.on_event("startup")
def load_model():
    global llm
    model_path_file = "/tmp/model_path.txt"
    if not os.path.exists(model_path_file):
        raise RuntimeError(f"Model path file not found: {model_path_file}")
    with open(model_path_file, "r") as f:
        model_path = f.read().strip()
    if not os.path.exists(model_path):
        raise RuntimeError(f"Model not found at path: {model_path}")
    llm = Llama(model_path=model_path)


@app.get("/", response_class=PlainTextResponse)
async def root():
    return "Ollama is running"


@app.get("/health")
async def health_check():
    return {"status": "ok"}


@app.get("/api/tags")
async def api_tags():
    return JSONResponse(content={
        "data": [model.dict() for model in AVAILABLE_MODELS]
    })


@app.get("/models")
async def list_models():
    # Return info for all available models
    return [model.dict() for model in AVAILABLE_MODELS]


@app.get("/api/v0/models")
async def api_models():
    return {"data": [model.dict() for model in AVAILABLE_MODELS]}


@app.get("/models/{model_id}")
async def get_model(model_id: str):
    for model in AVAILABLE_MODELS:
        if model.id == model_id:
            return model.dict()
    raise HTTPException(status_code=404, detail="Model not found")


@app.post("/chat")
async def chat(req: ChatRequest):
    global llm
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not initialized")

    # Validate the requested model against the advertised list
    if req.model not in [m.id for m in AVAILABLE_MODELS]:
        raise HTTPException(status_code=400, detail="Unsupported model")

    # Construct a flat prompt from the chat messages
    prompt = ""
    for m in req.messages:
        prompt += f"{m.role}: {m.content}\n"
    prompt += "assistant:"

    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"],
    )
    text = output.get("choices", [{}])[0].get("text", "").strip()

    return {
        "id": str(uuid.uuid4()),
        "model": req.model,
        "choices": [
            {
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }


@app.post("/api/v0/generate")
async def api_generate(req: GenerateRequest):
    global llm
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not initialized")
    if req.model not in [m.id for m in AVAILABLE_MODELS]:
        raise HTTPException(status_code=400, detail="Unsupported model")

    output = llm(
        req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["\n\n"],  # Or any stop sequence you want
    )
    text = output.get("choices", [{}])[0].get("text", "").strip()

    return {
        "id": str(uuid.uuid4()),
        "model": req.model,
        "choices": [
            {
                "text": text,
                "index": 0,
                "finish_reason": "stop",
            }
        ],
    }
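
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the server): a minimal client for
# the /chat endpoint, assuming the app is served with something like
# `uvicorn main:app --host 0.0.0.0 --port 8000` (module name, host, and port
# are assumptions, not defined in this file). Save as a separate script and
# run it once the startup hook has finished loading the model:
#
#     import requests
#
#     payload = {
#         "model": "codellama-7b-instruct",
#         "messages": [{"role": "user", "content": "Say hello in one line."}],
#         "temperature": 0.7,
#         "max_tokens": 64,
#     }
#     resp = requests.post("http://localhost:8000/chat", json=payload, timeout=120)
#     resp.raise_for_status()
#     print(resp.json()["choices"][0]["message"]["content"])
# ---------------------------------------------------------------------------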