# Use an official Python runtime as the base image.
# The Debian release is pinned so the set of available apt packages stays
# stable across rebuilds instead of following the floating "3.9" tag.
FROM python:3.9-bookworm

# Runtime environment:
#   PYTHONUNBUFFERED    - emit Python output unbuffered for real-time logging.
#   HOME                - make PaddleOCR use /app/.paddleocr instead of the
#                         root user's home directory.
#   PADDLEOCR_CACHE_DIR - custom cache directory for PaddleOCR model files.
#   TRANSFORMERS_CACHE  - custom cache directory for Transformers model files.
# NOTE(review): confirm that the installed PaddleOCR release actually reads
# PADDLEOCR_CACHE_DIR. Recent Transformers releases deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME - verify before upgrading.
ENV PYTHONUNBUFFERED=1 \
    HOME=/app \
    PADDLEOCR_CACHE_DIR=/app/.cache/paddleocr \
    TRANSFORMERS_CACHE=/app/.cache/huggingface
# Create /app (the directory HOME is expected to point at), install system
# dependencies including Git LFS and ccache, then set up Git LFS.
#   - libgl1 replaces libgl1-mesa-glx: the latter is only a transitional
#     package and has been dropped from newer Debian releases, where
#     installing it fails with "no installation candidate".
#   - libgl1 / libglib2.0-0 are presumably required by the OpenCV builds that
#     the Python dependencies pull in - confirm before removing.
#   - An empty /app/.gitconfig is created first to satisfy Git LFS, whose
#     `install` step writes its configuration there.
#   - The apt package lists are removed in the same layer to keep it small.
RUN mkdir -p /app && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        git-lfs \
        libgl1 \
        libglib2.0-0 && \
    touch /app/.gitconfig && \
    git lfs install --force && \
    rm -rf /var/lib/apt/lists/*
# Create the model/cache directories and make them world-writable so that
# model files can be downloaded at runtime regardless of which user the
# container runs as.
RUN model_dirs="/app/.paddleocr /app/.cache/paddleocr /app/.cache/huggingface" && \
    mkdir -p $model_dirs && \
    chmod -R 777 $model_dirs
# Set the working directory for the container.
WORKDIR /app

# Upgrade pip tooling and install the pinned Python dependencies.
# PaddlePaddle (CPU-only) is installed using the official find-links URL,
# followed by PaddleOCR.
# This runs BEFORE the source is copied: none of these installs read files
# from the repository, so editing application code no longer invalidates the
# (very large) dependency layers.
# --no-cache-dir is used on every pip call so no download cache is left in
# the image.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
    pip install --no-cache-dir \
        flask==3.1.0 \
        opencv-python-headless==4.11.0.86 \
        numpy==1.23.5 \
        spacy==3.8.3 \
        spacy-legacy==3.0.12 \
        spacy-loggers==1.0.5 \
        sentence-transformers==3.4.1 \
        transformers==4.49.0 \
        torch==2.6.0 \
        requests==2.32.3 && \
    pip install --no-cache-dir paddlepaddle==2.6.2 \
        -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \
    pip install --no-cache-dir paddleocr==2.9.1

# Download spaCy's English model ("en_core_web_md").
# Extra arguments after the model name are forwarded to pip by spaCy.
RUN python -m spacy download en_core_web_md --no-cache-dir

# Copy the repository contents into the container (last, so that source
# changes only rebuild this layer).
COPY . /app
# Optional: pre-load heavy models and tokenizers so they are cached inside the image.
# NOTE: this step is currently DISABLED (the RUN below is commented out), so the
# models listed here are not fetched during the build. Uncomment it to bake them in:
#   • SentenceTransformer's "all-mpnet-base-v2"
#   • spaCy's "en_core_web_md"
#   • Transformers model and tokenizer "roberta-large-mnli" used by the entailment classifier
#   • PaddleOCR's models (downloaded into /app/.paddleocr)
#RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')" && \
#    python -c "import spacy; spacy.load('en_core_web_md')" && \
#    python -c "from transformers import AutoModel, AutoTokenizer; AutoModel.from_pretrained('roberta-large-mnli'); AutoTokenizer.from_pretrained('roberta-large-mnli')" && \
#    python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"
# Document that the Flask application listens on port 7860.
# EXPOSE does not publish the port; it must still be mapped at `docker run`.
EXPOSE 7860

# Specify the Flask application entry point.
ENV FLASK_APP=app.py
# Run the Flask application when the container starts. | |
CMD | |