File size: 2,445 Bytes
2532c03
4c3b216
 
755e9cf
be83296
f9fe302
d1140a8
 
 
 
 
be83296
755e9cf
2532c03
c3a9a1f
 
be83296
2532c03
ad6491a
 
 
f9fe302
ad6491a
827332d
 
f9fe302
 
 
852520e
 
 
 
 
 
 
 
 
15afe55
2532c03
78f1573
 
396c410
f637c1a
 
 
 
2da7e92
 
c3a9a1f
2532c03
1ea03e2
 
 
 
a5a503a
e2e4dc9
f2bd049
2532c03
be83296
b1a7a2d
 
6050ba1
2532c03
827332d
2532c03
 
827332d
2532c03
2987771
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Use CUDA 11.8 with cuDNN 8.
# The -devel variant is intentional: it ships the CUDA toolchain, which
# packages like bitsandbytes/xformers may need to build or JIT kernels.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04

# Build-time only: silence interactive apt prompts (tzdata etc.).
# ARG instead of ENV so the setting does not leak into the runtime
# environment of containers started from this image.
ARG DEBIAN_FRONTEND=noninteractive

# Install basic utilities in a single layer.
# --no-install-recommends keeps the image lean, so packages normally pulled
# in as Recommends must be listed explicitly: ca-certificates (HTTPS for
# curl) and gnupg (PPA key handling for add-apt-repository).
# Removing /var/lib/apt/lists in the same layer avoids baking the stale
# package index into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    gnupg \
    software-properties-common \
    tzdata && \
    ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
    echo "Etc/UTC" > /etc/timezone && \
    rm -rf /var/lib/apt/lists/*

# Install Python 3.11 from the deadsnakes PPA (Ubuntu 20.04 ships 3.8).
# python3.11-distutils is packaged separately by deadsnakes and is expected
# by the pip/setuptools bootstrap below.
# update + install stay in one layer (stale-cache bug otherwise), and the
# apt lists are cleaned up in the same layer.
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && \
    apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-dev \
    python3.11-distutils && \
    rm -rf /var/lib/apt/lists/*

# Bootstrap pip for 3.11; curl -f makes an HTTP error fail the build
# instead of piping an error page into the interpreter.
RUN curl -fsS https://bootstrap.pypa.io/get-pip.py | python3.11
# Make the bare `python` command resolve to python3.11.
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# Install Python packages (torch, xFormers, Transformers, etc.)
# Three pip invocations chained in ONE layer, in a deliberate order:
#   1. torch/vision/audio from the cu118 wheel index (must match the CUDA
#      11.8 base image);
#   2. xformers pinned to 0.0.27 but resolved with the cu118 *nightly*
#      torch index as a fallback — NOTE(review): a pinned release against a
#      nightly index looks intentional but fragile; confirm the pin still
#      resolves there;
#   3. the remaining libraries from PyPI. These are unpinned, so rebuilds
#      are not reproducible — consider pinning versions.
# --no-cache-dir on every invocation keeps the pip cache out of the layer.
RUN python -m pip install --no-cache-dir \
    torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 && \
    python -m pip install --no-cache-dir \
    xformers==0.0.27 \
        --extra-index-url https://download.pytorch.org/whl/nightly/cu118/torch_nightly.html && \
    python -m pip install --no-cache-dir \
    transformers \
    accelerate \
    trl \
    unsloth \
    pandas \
    datasets \
    huggingface_hub \
    safetensors \
    bitsandbytes

# Helps reduce CUDA memory fragmentation in PyTorch's caching allocator.
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Cache directories, all rooted under /workspace (see mkdir layer below).
# Related variables grouped into one ENV instruction for readability.
# NOTE: TRANSFORMERS_CACHE is deprecated in recent transformers releases in
# favor of HF_HOME; both are kept for compatibility. The variable that the
# datasets library actually honors is HF_DATASETS_CACHE, so it is added
# here; DATASETS_CACHE is retained unchanged in case something external
# reads it.
ENV HF_HOME=/workspace/.huggingface \
    TRANSFORMERS_CACHE=/workspace/.huggingface \
    DATASETS_CACHE=/workspace/.cache \
    HF_DATASETS_CACHE=/workspace/.cache \
    TORCH_HOME=/workspace/.cache \
    XDG_CACHE_HOME=/workspace/.cache \
    TRITON_CACHE_DIR=/workspace/.cache/triton

# Create directories and set permissions
# Output, Hugging Face, and Triton cache dirs must exist before the ENV
# cache paths above are used at runtime.
# NOTE(review): chmod -R 777 is a blanket grant — presumably so a non-root
# runtime user (e.g. on a shared RunPod-style volume) can write everywhere;
# prefer a dedicated user plus --chown if the deployment allows it.
RUN mkdir -p /workspace/outputs \
    && mkdir -p /workspace/.huggingface \
    && mkdir -p /workspace/.cache/triton \
    && chmod -R 777 /workspace

# Absolute working directory for all following COPY/CMD instructions.
WORKDIR /workspace

# Copy the training script and the train/validation CSVs into /workspace.
# One multi-source COPY (trailing slash on the destination is required);
# the resulting files are identical to copying each one individually.
COPY finetune_script.py train.csv valid.csv /workspace/

# Preconfigure Accelerate for multi-GPU usage (4 GPUs, fp16)
# NOTE(review): the payload is JSON written to a .yaml file — this parses
# only because JSON is a subset of YAML; confirm accelerate accepts it, or
# emit native YAML.
# NOTE(review): the path assumes the container runs as root (accelerate
# looks under $HOME/.cache); if a USER directive is ever added, this config
# will not be found.
RUN mkdir -p /root/.cache/huggingface/accelerate && \
    echo '{"compute_environment": "LOCAL_MACHINE", "distributed_type": "MULTI_GPU", "num_processes": 4, "mixed_precision": "fp16", "machine_rank": 0, "main_training_function": "main", "use_cpu": false, "num_machines": 1}' \
    > /root/.cache/huggingface/accelerate/default_config.yaml

# Launch your training script on 4 GPUs with fp16.
# Exec (JSON-array) form: accelerate is PID 1 and receives SIGTERM from
# `docker stop`. The CLI flags repeat the values in the default_config.yaml
# written above, so the launch works even if that config is absent.
# Overridable at `docker run <image> <other command>`.
CMD ["accelerate", "launch", "--num_processes=4", "--mixed_precision=fp16", "finetune_script.py"]