Files changed (4)
  1. DOCKERFILE +33 -0
  2. download_model.py +8 -0
  3. huggingface.yaml +1 -0
  4. main.py +7 -9
DOCKERFILE ADDED
@@ -0,0 +1,33 @@
+ # Base image with Python and llama-cpp dependencies
+ FROM python:3.11-slim
+
+ # System dependencies for llama-cpp
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     wget \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python packages
+ RUN pip install --no-cache-dir \
+     llama-cpp-python==0.2.66 \
+     fastapi \
+     uvicorn \
+     huggingface-hub
+
+ # Create app directory
+ WORKDIR /app
+ COPY . /app
+
+ # Download model from Hugging Face Hub (on container startup)
+ ENV MODEL_REPO=TheBloke/phi-2-GGUF
+ ENV MODEL_FILE=phi-2.Q4_K_M.gguf
+
+ # Create model loader script
+ RUN echo '#!/bin/bash\n'\
+     'python download_model.py\n'\
+     'uvicorn main:app --host 0.0.0.0 --port 7860' > entrypoint.sh && \
+     chmod +x entrypoint.sh
+
+ CMD ["./entrypoint.sh"]
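Note: the generated entrypoint.sh only chains two steps, download the model and start the server. As a rough Python sketch of that startup order (purely illustrative; the real entrypoint is the bash one-liner above):

import subprocess
import uvicorn

# Fetch the GGUF model first, same step the bash entrypoint performs.
subprocess.run(["python", "download_model.py"], check=True)

# Then serve the FastAPI app on port 7860, the port Hugging Face Spaces expects.
uvicorn.run("main:app", host="0.0.0.0", port=7860)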
download_model.py ADDED
@@ -0,0 +1,8 @@
+ from huggingface_hub import hf_hub_download
+ import os
+
+ repo_id = os.environ.get("MODEL_REPO")
+ filename = os.environ.get("MODEL_FILE")
+
+ print("Downloading model:", repo_id, filename)
+ hf_hub_download(repo_id=repo_id, filename=filename, local_dir=".", local_dir_use_symlinks=False)
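When this script runs outside the container, MODEL_REPO and MODEL_FILE are unset and hf_hub_download receives None. A minimal hedged variant that falls back to the defaults baked into the Dockerfile (repo and file names taken from this PR) could look like:

import os
from huggingface_hub import hf_hub_download

# Fall back to the Dockerfile defaults when the env vars are not set.
repo_id = os.environ.get("MODEL_REPO", "TheBloke/phi-2-GGUF")
filename = os.environ.get("MODEL_FILE", "phi-2.Q4_K_M.gguf")

print("Downloading model:", repo_id, filename)
path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=".")
print("Saved to:", path)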
huggingface.yaml ADDED
@@ -0,0 +1 @@
+ sdk: docker
main.py CHANGED
@@ -1,13 +1,15 @@
- from fastapi import FastAPI, Request
- from llama_cpp import Llama
+ from fastapi import FastAPI
  from pydantic import BaseModel
  from typing import List
- import uvicorn

  app = FastAPI()

- # Load small model (e.g., Phi-2 or DeepSeek)
- llm = Llama(model_path="phi-2.Q4_K_M.gguf", n_ctx=2048, n_threads=2)
+ from llama_cpp import Llama
+
+ llm = Llama(
+     model_path="phi-2.Q4_K_M.gguf",
+     n_ctx=2048,
+     n_threads=2
+ )

  class Message(BaseModel):
      role: str
@@ -18,7 +20,6 @@ class ChatRequest(BaseModel):
      messages: List[Message]
      temperature: float = 0.7
      max_tokens: int = 256
-     stream: bool = False

  @app.post("/v1/chat/completions")
  async def chat_completions(req: ChatRequest):
@@ -40,6 +41,3 @@ async def chat_completions(req: ChatRequest):
          }],
          "model": req.model
      }
-
- if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=8000)
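Once the Space is up, the endpoint can be exercised with any OpenAI-style client. A minimal sketch using requests, assuming the server is reachable on localhost:7860 and that ChatRequest also declares the model field the handler echoes back:

import requests

resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "model": "phi-2",  # echoed back in the response by the handler
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.7,
        "max_tokens": 64,
    },
    timeout=120,
)
print(resp.json()["choices"][0])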