Files changed (4)
  1. DOCKERFILE +33 -0
  2. download_model.py +8 -0
  3. huggingface.yaml +1 -0
  4. main.py +7 -9
DOCKERFILE ADDED
@@ -0,0 +1,33 @@
+ # Base image with Python and llama-cpp dependencies
+ FROM python:3.11-slim
+
+ # System dependencies for llama-cpp
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     wget \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python packages
+ RUN pip install --no-cache-dir \
+     llama-cpp-python==0.2.66 \
+     fastapi \
+     uvicorn \
+     huggingface-hub
+
+ # Create app directory
+ WORKDIR /app
+ COPY . /app
+
+ # Download model from Hugging Face Hub (on container startup)
+ ENV MODEL_REPO=TheBloke/phi-2-GGUF
+ ENV MODEL_FILE=phi-2.Q4_K_M.gguf
+
+ # Create model loader script
+ RUN echo '#!/bin/bash\n'\
+     'python download_model.py\n'\
+     'uvicorn main:app --host 0.0.0.0 --port 7860' > entrypoint.sh && \
+     chmod +x entrypoint.sh
+
+ CMD ["./entrypoint.sh"]
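Note: the generated entrypoint.sh only chains two steps, download the model and start the server. As a rough Python sketch of that startup order (purely illustrative; the real entrypoint is the bash one-liner above):

import subprocess
import uvicorn

# Fetch the GGUF model first, same step the bash entrypoint performs.
subprocess.run(["python", "download_model.py"], check=True)

# Then serve the FastAPI app on port 7860, the port Hugging Face Spaces expects.
uvicorn.run("main:app", host="0.0.0.0", port=7860)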
download_model.py ADDED
@@ -0,0 +1,8 @@
+ from huggingface_hub import hf_hub_download
+ import os
+
+ repo_id = os.environ.get("MODEL_REPO")
+ filename = os.environ.get("MODEL_FILE")
+
+ print("Downloading model:", repo_id, filename)
+ hf_hub_download(repo_id=repo_id, filename=filename, local_dir=".", local_dir_use_symlinks=False)
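When this script runs outside the container, MODEL_REPO and MODEL_FILE are unset and hf_hub_download receives None. A minimal hedged variant that falls back to the defaults baked into the Dockerfile (repo and file names taken from this PR) could look like:

import os
from huggingface_hub import hf_hub_download

# Fall back to the Dockerfile defaults when the env vars are not set.
repo_id = os.environ.get("MODEL_REPO", "TheBloke/phi-2-GGUF")
filename = os.environ.get("MODEL_FILE", "phi-2.Q4_K_M.gguf")

print("Downloading model:", repo_id, filename)
path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=".")
print("Saved to:", path)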
huggingface.yaml ADDED
@@ -0,0 +1 @@
+ sdk: docker
main.py CHANGED
@@ -1,13 +1,15 @@
- from fastapi import FastAPI, Request
- from llama_cpp import Llama
+ from fastapi import FastAPI
  from pydantic import BaseModel
  from typing import List
- import uvicorn

  app = FastAPI()

- # Load small model (e.g., Phi-2 or DeepSeek)
- llm = Llama(model_path="phi-2.Q4_K_M.gguf", n_ctx=2048, n_threads=2)
+ from llama_cpp import Llama
+
+ llm = Llama(
+     model_path="phi-2.Q4_K_M.gguf",
+     n_ctx=2048,
+     n_threads=2
+ )

  class Message(BaseModel):
      role: str
@@ -18,7 +20,6 @@ class ChatRequest(BaseModel):
      messages: List[Message]
      temperature: float = 0.7
      max_tokens: int = 256
-     stream: bool = False

  @app.post("/v1/chat/completions")
  async def chat_completions(req: ChatRequest):
@@ -40,6 +41,3 @@ async def chat_completions(req: ChatRequest):
          }],
          "model": req.model
      }
-
- if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=8000)
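Once the Space is up, the endpoint can be exercised with any OpenAI-style client. A minimal sketch using requests, assuming the server is reachable on localhost:7860 and that ChatRequest also declares the model field the handler echoes back:

import requests

resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "model": "phi-2",  # echoed back in the response by the handler
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.7,
        "max_tokens": 64,
    },
    timeout=120,
)
print(resp.json()["choices"][0])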