Spaces:
Runtime error
Runtime error
File size: 2,445 Bytes
2532c03 4c3b216 755e9cf be83296 f9fe302 d1140a8 be83296 755e9cf 2532c03 c3a9a1f be83296 2532c03 ad6491a f9fe302 ad6491a 827332d f9fe302 852520e 15afe55 2532c03 78f1573 396c410 f637c1a 2da7e92 c3a9a1f 2532c03 1ea03e2 a5a503a e2e4dc9 f2bd049 2532c03 be83296 b1a7a2d 6050ba1 2532c03 827332d 2532c03 827332d 2532c03 2987771 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# Base image: CUDA 11.8 toolkit with cuDNN 8 (devel variant) on Ubuntu 20.04.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
# Build-time only: keeps apt/tzdata from prompting during the RUN steps below
# without leaving DEBIAN_FRONTEND set in the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive
# Install basic utilities (curl, add-apt-repository, tzdata) and set the
# system timezone to UTC.
# The apt package lists are removed in the same layer so they do not end up
# in the image; any later apt step must run `apt-get update` again.
RUN apt-get update && apt-get install -y \
        curl \
        software-properties-common \
        tzdata && \
    ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
    echo "Etc/UTC" > /etc/timezone && \
    rm -rf /var/lib/apt/lists/*
# Install Python 3.11 (interpreter and development headers) from the
# deadsnakes PPA.
# -y answers the PPA confirmation prompt so the step never waits on stdin;
# the apt package lists are dropped in the same layer to keep the image small.
RUN add-apt-repository -y ppa:deadsnakes/ppa && apt-get update && \
    apt-get install -y python3.11 python3.11-dev && \
    rm -rf /var/lib/apt/lists/*
# Install pip for Python 3.11.
# The bootstrap script is downloaded to a file instead of being piped into the
# interpreter: with a pipe and the default /bin/sh (no pipefail) a failed
# download would be masked by python3.11 exiting 0 on empty input.
# curl -f makes HTTP errors fail the build at this step.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && \
    python3.11 /tmp/get-pip.py --no-cache-dir && \
    rm -f /tmp/get-pip.py
# Register Python 3.11 as the `python` command.
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
# Install Python packages (torch, xFormers, Transformers, etc.)
# torch and xformers are resolved in ONE pip invocation against the CUDA 11.8
# wheel index, so pip selects a torch build that satisfies the torch version
# required by xformers 0.0.27 and both come from the cu118 channel.
# (Previously xformers was installed separately with --extra-index-url pointing
# at a nightly find-links HTML page, which is not a package index, so the
# resolver could replace the cu118 torch with the default PyPI build.)
# The remaining libraries come from the default index (PyPI).
RUN python -m pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu118 \
        torch \
        torchvision \
        torchaudio \
        xformers==0.0.27 && \
    python -m pip install --no-cache-dir \
        accelerate \
        bitsandbytes \
        datasets \
        huggingface_hub \
        pandas \
        safetensors \
        transformers \
        trl \
        unsloth
# Helps reduce CUDA memory fragmentation
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Cache directories, all under /workspace.
# HF_DATASETS_CACHE is the variable the `datasets` library reads for its cache
# location; DATASETS_CACHE is kept unchanged in case the training script reads
# it directly (NOTE(review): confirm, then drop it if unused).
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favour of HF_HOME; kept so the current cache path is unchanged.
ENV HF_HOME=/workspace/.huggingface \
    TRANSFORMERS_CACHE=/workspace/.huggingface \
    DATASETS_CACHE=/workspace/.cache \
    HF_DATASETS_CACHE=/workspace/.cache \
    TORCH_HOME=/workspace/.cache \
    XDG_CACHE_HOME=/workspace/.cache \
    TRITON_CACHE_DIR=/workspace/.cache/triton
# Prepare the workspace tree (outputs, Hugging Face cache, Triton cache) and
# open it up to every user, then make it the working directory.
RUN mkdir -p \
        /workspace/outputs \
        /workspace/.huggingface \
        /workspace/.cache/triton && \
    chmod -R 777 /workspace
WORKDIR /workspace
# Add the training/validation CSVs and the fine-tuning script to /workspace.
COPY train.csv valid.csv /workspace/
COPY finetune_script.py /workspace/
# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16).
# Accelerate looks for its default config in "$HF_HOME/accelerate" when HF_HOME
# is set, so the file is written there rather than under /root/.cache (where
# it would never be read once HF_HOME points elsewhere). If HF_HOME is unset
# at build time, fall back to the previous /root/.cache/huggingface location.
# The config body is JSON, which is also valid YAML.
RUN cfg_dir="${HF_HOME:-/root/.cache/huggingface}/accelerate" && \
    mkdir -p "$cfg_dir" && \
    echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
        > "$cfg_dir/default_config.yaml"
# Default command: run the fine-tuning script through `accelerate launch`
# with 4 processes and fp16 mixed precision (exec form, no shell wrapper).
CMD ["accelerate", "launch", \
     "--num_processes=4", \
     "--mixed_precision=fp16", \
     "finetune_script.py"]
|