grade / Dockerfile
kerols77's picture
Update Dockerfile
d6849d3 verified
# Base image: the official Python 3.9 runtime.
FROM python:3.9

# Environment shared by the build steps below and by the running container:
#   PYTHONUNBUFFERED     - disable output buffering so logs appear in real time
#   HOME                 - point the home directory at /app so PaddleOCR uses
#                          /app/.paddleocr instead of the root user's directory
#   PADDLEOCR_CACHE_DIR  - custom cache location for PaddleOCR model files
#                          NOTE(review): confirm something actually reads this variable
#   TRANSFORMERS_CACHE   - custom cache location for Transformers model files
ENV PYTHONUNBUFFERED=1 \
    HOME=/app \
    PADDLEOCR_CACHE_DIR=/app/.cache/paddleocr \
    TRANSFORMERS_CACHE=/app/.cache/huggingface
# Create /app (so that HOME exists), install the system dependencies, then set
# up the Git LFS configuration, all in one layer so the apt package lists
# removed at the end never persist in the image.
#
# libgl1 is requested directly instead of libgl1-mesa-glx: the latter is only a
# transitional package and is absent from newer Debian releases, so the build
# would fail as soon as the floating python:3.9 tag moves to such a base.
#
# An empty /app/.gitconfig is created before `git lfs install` to satisfy
# Git LFS requirements.
RUN mkdir -p /app && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        git-lfs \
        libgl1 \
        libglib2.0-0 && \
    touch /app/.gitconfig && \
    git lfs install --force && \
    rm -rf /var/lib/apt/lists/*
# Prepare the model download locations and open their permissions completely,
# so that model files can be downloaded into them later.
# The directory list is written once and reused by both commands.
RUN model_dirs="/app/.paddleocr /app/.cache/paddleocr /app/.cache/huggingface" && \
    mkdir -p $model_dirs && \
    chmod -R 777 $model_dirs
# Set the working directory for the container.
WORKDIR /app

# Upgrade pip and install the pinned Python dependencies.
# This runs BEFORE the application source is copied in: none of these commands
# read files from the repository, so keeping them ahead of COPY lets Docker
# reuse this expensive layer whenever only the source code changes.
# PaddlePaddle (CPU-only) is installed from the official find-links URL, then PaddleOCR.
# --no-cache-dir is passed to every pip call so pip's download cache is not
# stored in the image layer.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
    pip install --no-cache-dir \
        flask==3.1.0 \
        opencv-python-headless==4.11.0.86 \
        numpy==1.23.5 \
        spacy==3.8.3 \
        spacy-legacy==3.0.12 \
        spacy-loggers==1.0.5 \
        sentence-transformers==3.4.1 \
        transformers==4.49.0 \
        torch==2.6.0 \
        requests==2.32.3 && \
    pip install --no-cache-dir paddlepaddle==2.6.2 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \
    pip install --no-cache-dir paddleocr==2.9.1

# Download spaCy's English model ("en_core_web_md").
RUN python -m spacy download en_core_web_md

# Copy the repository contents into the container.
# Done last among the build steps above because the source is what changes most often.
COPY . /app
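# OPTIONAL, CURRENTLY DISABLED: the RUN instruction below is commented out, so
# none of the following is cached in the image at build time. When enabled, it
# pre-loads the heavy models and tokenizers so they are cached inside the image:
# • SentenceTransformer's "all-mpnet-base-v2"
# • spaCy's "en_core_web_md"
# • Transformers model and tokenizer "roberta-large-mnli" used by the entailment classifier
# • PaddleOCR's models (downloaded into /app/.paddleocr)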
#RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')" && \
# python -c "import spacy; spacy.load('en_core_web_md')" && \
# python -c "from transformers import AutoModel, AutoTokenizer; AutoModel.from_pretrained('roberta-large-mnli'); AutoTokenizer.from_pretrained('roberta-large-mnli')" && \
# python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
# Entry point module that the `flask` command loads as the application.
ENV FLASK_APP=app.py
# Document the port the Flask application listens on.
EXPOSE 7860
# Default container command: start the Flask application, listening on
# 0.0.0.0 at port 7860.
CMD ["flask", "run", "--host", "0.0.0.0", "--port", "7860"]