File size: 1,138 Bytes
380fa4c
3fbb6a1
 
be51358
3fbb6a1
 
 
937d5fa
3fbb6a1
 
 
43b8a1d
937d5fa
 
be51358
32e9a12
 
d476ac9
380fa4c
 
 
 
 
be51358
 
 
32e9a12
 
 
 
3fbb6a1
380fa4c
937d5fa
3fbb6a1
32e9a12
937d5fa
3fbb6a1
 
32e9a12
3fbb6a1
 
32e9a12
937d5fa
3fbb6a1
32e9a12
be51358
 
380fa4c
32e9a12
 
380fa4c
3fbb6a1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Use a lightweight Python base image
FROM python:3.10-slim

# Install system-level dependencies.
# - update + install + list cleanup happen in ONE layer so the apt index is
#   never stale and never shipped in the image.
# - --no-install-recommends keeps optional "Recommends" packages out of the
#   image; ca-certificates is listed explicitly because curl and git only
#   recommend it, and HTTPS downloads need it.
# - Packages are listed one per line, sorted alphabetically, for clean diffs.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    ffmpeg \
    git \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    poppler-utils \
    tesseract-ocr \
 && rm -rf /var/lib/apt/lists/*

# Link the Tesseract binary into /usr/local/bin, then run it once so the build
# fails at this step if the OCR engine cannot be executed.
RUN ln -s /usr/bin/tesseract /usr/local/bin/tesseract \
 && tesseract --version

# Set environment variables (persist into the running container)
#   PYTHONUNBUFFERED                - unbuffered stdout/stderr, so logs appear immediately
#   PIP_NO_CACHE_DIR                - turns off pip's download cache
#   HF_HUB_DISABLE_SYMLINKS_WARNING - silences the huggingface_hub symlink warning
#   LANG / LC_ALL                   - UTF-8 locale
#   TESSERACT_PATH                  - absolute path of the Tesseract executable
#                                     (presumably read by the application; verify in app code)
# PATH is deliberately left unchanged: TESSERACT_PATH names the executable
# itself, and PATH entries must be directories, so prepending it would have
# no effect. /usr/bin, where the binary lives, is already on PATH.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    TESSERACT_PATH=/usr/bin/tesseract

# Application directory; relative paths in later instructions resolve here
WORKDIR /home/user/app

# Copy only the dependency manifest first, so the install layer below is
# rebuilt only when requirements.txt itself changes
COPY requirements.txt ./
RUN pip install --upgrade pip \
 && pip install -r requirements.txt

# Download NLTK tokenizer data BEFORE copying the application code, so this
# layer stays cached when only source files change (a changed COPY layer
# invalidates every layer after it).
# Assumes nltk is installed by requirements.txt rather than shipped inside
# the application source -- confirm if the build fails here.
RUN python -m nltk.downloader punkt

# Copy application code
COPY . .

# Expose port for Gradio (documentation only; publishing is done at run time)
EXPOSE 7860

# Container health check: probe the app over HTTP on the exposed port.
# - interval, timeout and retries are Docker's default values, written out
#   explicitly so the probe cadence is visible here.
# - --start-period gives the app time to boot: probe failures during this
#   window do not count toward the retry limit.
# - curl runs silently and discards the response body, so the health log
#   records only errors; the pass/fail result is still decided by --fail.
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl --fail --silent --show-error --output /dev/null http://localhost:7860 || exit 1

# Start the application (exec form: python runs as PID 1 and receives signals)
CMD ["python", "app.py"]