# Base image: the official Python 3.9 image (full, non-slim variant) on
# Debian 12 "bookworm". The Debian release is pinned explicitly because the
# bare `python:3.9` tag floats between Debian releases, and the apt package
# names installed later in this file are release-specific.
FROM python:3.9-bookworm

# Environment shared by all later build steps and by the running container:
#   PYTHONUNBUFFERED    - unbuffered Python output, for real-time logging.
#   HOME                - /app, so that PaddleOCR uses /app/.paddleocr instead
#                         of the root user's home directory.
#   PADDLEOCR_CACHE_DIR - custom cache directory for PaddleOCR model files.
#   TRANSFORMERS_CACHE  - custom cache directory for Transformers model files.
ENV PYTHONUNBUFFERED=1 \
    HOME=/app \
    PADDLEOCR_CACHE_DIR=/app/.cache/paddleocr \
    TRANSFORMERS_CACHE=/app/.cache/huggingface

# Create /app (so that HOME exists), install the system dependencies, and set
# up Git LFS, all in one layer.
#   build-essential - compiler toolchain
#   ccache          - compiler cache
#   git-lfs         - Git Large File Storage extension
#   libgl1          - provides libGL.so.1. This replaces the transitional
#                     `libgl1-mesa-glx` package, which is not available on
#                     newer Debian releases (13 "trixie" and later) and would
#                     make `apt-get install` fail there.
#   libglib2.0-0    - GLib runtime library
# An empty /app/.gitconfig is created first to satisfy Git LFS requirements
# before `git lfs install` runs. The apt package lists are deleted in the same
# layer so they do not add to the image size.
RUN mkdir -p /app && \
    apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        git-lfs \
        libgl1 \
        libglib2.0-0 && \
    touch /app/.gitconfig && \
    git lfs install --force && \
    rm -rf /var/lib/apt/lists/*

# Prepare the model/cache directories. Each one is created (with parents) and
# then opened up recursively to mode 777 so that model files can be downloaded
# into it.
RUN set -e; \
    for dir in /app/.paddleocr /app/.cache/paddleocr /app/.cache/huggingface; do \
        mkdir -p "$dir"; \
        chmod -R 777 "$dir"; \
    done

# Set the working directory for the container.
WORKDIR /app

# Upgrade pip and install Python dependencies.
# This runs BEFORE the application source is copied in: none of these commands
# read files from the build context, so Docker can reuse these large layers
# from cache when only the source code changes.
# `--no-cache-dir` is passed to every pip invocation here so that no download
# cache is left behind in the image.
# Note: We install PaddlePaddle (CPU-only) using the official find-links URL, then PaddleOCR.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
    pip install --no-cache-dir \
        flask==3.1.0 \
        opencv-python-headless==4.11.0.86 \
        numpy==1.23.5 \
        spacy==3.8.3 \
        spacy-legacy==3.0.12 \
        spacy-loggers==1.0.5 \
        sentence-transformers==3.4.1 \
        transformers==4.49.0 \
        torch==2.6.0 \
        requests==2.32.3 && \
    pip install --no-cache-dir paddlepaddle==2.6.2 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html && \
    pip install --no-cache-dir paddleocr==2.9.1

# Download spaCy's English model ("en_core_web_md")
RUN python -m spacy download en_core_web_md

# Copy the repository contents into the container.
# This is done last so that editing application code does not invalidate the
# dependency layers above.
COPY . /app

# NOTE: This pre-load step is currently DISABLED (the RUN instruction below is
# commented out), so these commands do not run during the build.
# When re-enabled, it loads the following heavy models and tokenizers so they're
# cached inside the image:
#   • SentenceTransformer's "all-mpnet-base-v2"
#   • spaCy's "en_core_web_md"
#   • Transformers model and tokenizer "roberta-large-mnli" used by your entailment classifier
#   • PaddleOCR's models (downloaded into /app/.paddleocr)
# RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')" && \
#     python -c "import spacy; spacy.load('en_core_web_md')" && \
#     python -c "from transformers import AutoModel, AutoTokenizer; AutoModel.from_pretrained('roberta-large-mnli'); AutoTokenizer.from_pretrained('roberta-large-mnli')" && \
#     python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en')"

# Flask application entry point, read by the `flask` command below.
ENV FLASK_APP=app.py

# Port the Flask application listens on (matches --port in CMD).
EXPOSE 7860

# Default command at container start: run the Flask application on all
# interfaces, port 7860.
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]