# Container image for the Flask OCR/NLP service (entry point: app.py, port 7860).
#
# Layers are ordered from least to most frequently changing (OS packages ->
# Python dependencies -> spaCy model -> application source) so that editing
# application code does not force the very large dependency layers to rebuild.

# Official Python 3.9 runtime, pinned to the Debian "bookworm" release so the
# apt package names used below resolve the same way on every build.
FROM python:3.9-bookworm

# PYTHONUNBUFFERED     - unbuffered Python output for real-time logging.
# HOME                 - makes PaddleOCR use /app/.paddleocr instead of the
#                        root user's home directory.
# PADDLEOCR_CACHE_DIR / TRANSFORMERS_CACHE
#                      - custom cache locations for large model files.
ENV PYTHONUNBUFFERED=1 \
    HOME=/app \
    PADDLEOCR_CACHE_DIR=/app/.cache/paddleocr \
    TRANSFORMERS_CACHE=/app/.cache/huggingface

# Set the working directory. WORKDIR also creates /app, so HOME exists before
# any later step writes into it.
WORKDIR /app

# Install system dependencies (including Git LFS and ccache), then configure
# Git LFS. An empty $HOME/.gitconfig is created first to satisfy Git LFS.
# `libgl1` replaces the transitional `libgl1-mesa-glx` package, which current
# Debian releases no longer ship. The apt lists are removed in the same layer
# so they never reach the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        git-lfs \
        libgl1 \
        libglib2.0-0 && \
    touch /app/.gitconfig && \
    git lfs install --force && \
    rm -rf /var/lib/apt/lists/*

# Create the model/cache directories with full permissions so that model files
# can be downloaded into them.
# NOTE(review): world-writable directories and the absence of a USER directive
# look deliberate (presumably the platform runs the container under an
# arbitrary non-root UID) - confirm before tightening permissions or adding USER.
RUN mkdir -p /app/.paddleocr /app/.cache/paddleocr /app/.cache/huggingface && \
    chmod -R 777 /app/.paddleocr /app/.cache/paddleocr /app/.cache/huggingface

# Upgrade pip and install Python dependencies.
# Note: PaddlePaddle (CPU-only) is installed from the official find-links URL
# first, then PaddleOCR. All versions are pinned for reproducible builds, and
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
    pip install --no-cache-dir \
        flask==3.1.0 \
        opencv-python-headless==4.11.0.86 \
        numpy==1.23.5 \
        spacy==3.8.3 \
        spacy-legacy==3.0.12 \
        spacy-loggers==1.0.5 \
        sentence-transformers==3.4.1 \
        transformers==4.49.0 \
        torch==2.6.0 \
        requests==2.32.3 && \
    pip install --no-cache-dir paddlepaddle==2.6.2 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \
    pip install --no-cache-dir paddleocr==2.9.1

# Download spaCy's English model ("en_core_web_md").
RUN python -m spacy download en_core_web_md

# Optional pre-loading of heavy models and tokenizers so they are cached inside
# the image (CURRENTLY DISABLED - the instruction below is commented out, so
# these models are presumably fetched at runtime on first use):
#   - SentenceTransformer's "all-mpnet-base-v2"
#   - spaCy's "en_core_web_md"
#   - Transformers model and tokenizer "roberta-large-mnli" used by the
#     entailment classifier
#   - PaddleOCR's models (downloaded into /app/.paddleocr)
#RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')" && \
#    python -c "import spacy; spacy.load('en_core_web_md')" && \
#    python -c "from transformers import AutoModel, AutoTokenizer; AutoModel.from_pretrained('roberta-large-mnli'); AutoTokenizer.from_pretrained('roberta-large-mnli')" && \
#    python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"

# Copy the repository contents into the container. This is done after all
# dependency installation because none of the steps above read the build
# context; a source-only change therefore reuses every cached layer above.
COPY . /app

# Expose port 7860 for the Flask application.
EXPOSE 7860

# Specify the Flask application entry point.
ENV FLASK_APP=app.py

# Run the Flask application when the container starts (exec form, so the
# process receives signals directly).
# NOTE(review): `flask run` starts Flask's development server - consider a
# production WSGI server if this image serves real traffic.
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]