# Dockerfile for a llama.cpp `llama-server` image built on Arch Linux.
# The model, quantization and server settings are supplied as build arguments.
# Arch Linux is a rolling release, so `latest` moves with every upstream image
# rebuild. Pin a dated tag or a digest here if reproducible builds are needed.
FROM archlinux:latest
# DEBIAN_FRONTEND is intentionally not set: it only influences Debian's
# apt/debconf tooling and is not read by pacman, which is made non-interactive
# with --noconfirm instead.
# Build arguments passed in from the Space environment. The defaults below are
# used for any argument that is not supplied at build time.
# -- model selection
ARG MODEL_ID="unsloth/Qwen3-1.7B-GGUF"
ARG QUANT="Q4_K_M"
ARG SERVED_NAME="qwen-nano"
# -- serving capacity
ARG CTX_SIZE=8192
ARG PARALLEL=4
# -- mode switches (0 = off)
ARG EMBEDDING_ONLY=0
ARG RERANK_ONLY=0

# llama.cpp configuration, expressed as LLAMA_ARG_* environment variables.
# The values are baked into the image from the build arguments above; the
# model reference is "<MODEL_ID>:<QUANT>" and the port matches the EXPOSE'd one.
ENV LLAMA_ARG_HF_REPO="${MODEL_ID}:${QUANT}" \
    LLAMA_ARG_ALIAS="${SERVED_NAME}" \
    LLAMA_ARG_CTX_SIZE=${CTX_SIZE} \
    LLAMA_ARG_N_PARALLEL=${PARALLEL} \
    LLAMA_ARG_BATCH=512 \
    LLAMA_ARG_FLASH_ATTN=1 \
    LLAMA_ARG_CACHE_TYPE_K="q8_0" \
    LLAMA_ARG_CACHE_TYPE_V="q4_1" \
    LLAMA_ARG_MLOCK=1 \
    LLAMA_ARG_N_GPU_LAYERS=0 \
    LLAMA_ARG_HOST="0.0.0.0" \
    LLAMA_ARG_PORT=7860 \
    LLAMA_ARG_EMBEDDINGS=${EMBEDDING_ONLY} \
    LLAMA_ARG_RERANKING=${RERANK_ONLY}
# Upgrade the system and install the build toolchain in a single layer.
# Keeping the database sync (-y) in the same RUN as the install prevents a
# cached, stale sync database from being reused when only the package list
# changes, which would make pacman request package versions the mirrors no
# longer carry. --needed skips packages that are already installed and current.
# The downloaded package archives are removed in the same layer so they do not
# add to the image size (rm is used because `pacman -Scc --noconfirm` answers
# "no" to its own cache-removal prompt).
RUN pacman -Syu --noconfirm --needed --overwrite '*' \
        base-devel \
        blas64-openblas \
        cmake \
        curl \
        gcc-libs \
        git \
        git-lfs \
        glibc \
        openblas \
        openblas64 \
        python \
    && rm -rf /var/cache/pacman/pkg/*
# Create the application directory and the cache directory that llama.cpp
# downloads models into, then open the cache up to every user.
# NOTE(review): the 777 mode presumably exists so a non-root runtime user can
# write downloaded models here -- confirm before tightening it.
RUN mkdir -p /app /.cache \
    && chmod -R 777 /.cache
WORKDIR /app
# Git ref (branch or tag) of llama.cpp to build. It defaults to master, which
# tracks upstream; pass --build-arg LLAMA_CPP_REF=<tag> to pin a known-good
# release and get reproducible builds.
ARG LLAMA_CPP_REF=master
# Shallow, single-branch clone: only the selected ref's latest commit is fetched.
RUN git clone --depth 1 --single-branch --branch "${LLAMA_CPP_REF}" https://github.com/ggml-org/llama.cpp.git
# Alternative fork, kept for reference:
# RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git llama.cpp
WORKDIR /app/llama.cpp
# Configure an optimized Release build in ./build: link-time optimization,
# OpenBLAS as the BLAS backend, curl support, and the server enabled; compiler
# warnings and CMake developer warnings are silenced.
# NOTE(review): GGML_NATIVE=ON presumably tunes the binary for the CPU of the
# build machine -- confirm the runtime host supports the same instruction set.
RUN cmake -B build -Wno-dev \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_NATIVE=ON \
        -DGGML_LTO=ON \
        -DGGML_LLAMAFILE=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DGGML_ALL_WARNINGS=OFF \
        -DGGML_ALL_WARNINGS_3RD_PARTY=OFF \
        -DLLAMA_CURL=ON \
        -DLLAMA_BUILD_SERVER=ON \
        -DLLAMA_BUILD_EXAMPLES=ON
# Compile only the llama-server target, using one job per available core.
RUN cmake --build build --config Release --target llama-server -j "$(nproc)"
# The server is started from /app.
WORKDIR /app
# Port the server listens on (same value as LLAMA_ARG_PORT).
EXPOSE 7860
# Exec form, so llama-server runs as PID 1 and receives stop signals directly.
# All other settings come from the LLAMA_ARG_* environment variables.
CMD ["/app/llama.cpp/build/bin/llama-server", "--verbose-prompt", "--swa-full"]