File size: 2,402 Bytes
aa1694d
c46fce7
cc07d68
aa1694d
 
 
 
 
 
 
 
 
c46fce7
d81e914
 
 
 
c46fce7
aa1694d
 
 
 
 
 
 
 
 
 
cc07d68
c46fce7
aa1694d
d81e914
cc07d68
c46fce7
cc07d68
48500f7
 
c46fce7
d81e914
 
c46fce7
 
 
 
 
 
 
d81e914
c46fce7
 
d81e914
 
cc07d68
c46fce7
aa1694d
c46fce7
 
aa1694d
 
ec85693
 
 
 
 
 
 
 
 
d84ecbc
 
c46fce7
aa1694d
732e860
aa1694d
ec85693
e9537a4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# syntax=docker/dockerfile:1.6

# Base image; override at build time with --build-arg PY_BASE=<image>.
ARG PY_BASE=python:3.9-slim-bullseye
FROM ${PY_BASE}

# Python / pip behaviour: no .pyc files, unbuffered stdout/stderr, no pip cache.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

# Thread/parallelism limits and the model cache directory for the ML stack.
ENV TOKENIZERS_PARALLELISM=false \
    OMP_NUM_THREADS=1 \
    TRANSFORMERS_CACHE=/cache/hf

# Cache buster: change DEPS_REFRESH (or pass --build-arg DEPS_REFRESH=...) to
# invalidate the build cache from here on, so every later step is re-run.
ARG DEPS_REFRESH=2025-09-07-06
# The value is also exported into the image's environment.
ENV DEPS_REFRESH=${DEPS_REFRESH}
RUN echo "CACHEBUSTER=${DEPS_REFRESH}"

# OS packages: Tesseract OCR with English + OSD data, shared libraries needed
# by the OpenCV wheels, and curl (used by the HEALTHCHECK) with CA certificates.
# The apt lists are removed in the same layer so they never land in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      ca-certificates \
      curl \
      libgl1 \
      libglib2.0-0 \
      tesseract-ocr \
      tesseract-ocr-eng \
      tesseract-ocr-osd \
 && rm -rf /var/lib/apt/lists/*

# Directory where Tesseract looks for its language data.
# NOTE(review): the 4.00 path is tied to the Tesseract version packaged for the
# default base image; confirm it if PY_BASE is overridden.
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata

WORKDIR /app

# Copy only the dependency manifest first, so the install layer below stays
# cached until requirements.txt itself changes.
COPY requirements.txt ./

# Upgrade the packaging toolchain, then install the application dependencies.
RUN python -m pip install --upgrade pip setuptools wheel \
 && python -m pip install --no-cache-dir -r requirements.txt

# spaCy English model, pinned to the 3.2.0 wheel to match spaCy 3.2.x.
# --no-deps stops pip from installing or changing any other package.
RUN python -m pip install --no-deps \
  "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"

# Build-time diagnostics: print the Python version and the installed versions
# of the key packages, read from distribution metadata (no __version__
# attribute assumptions). Informational only: a missing package is reported as
# "not-installed" instead of failing the build.
RUN python - <<'PY'
import sys
from importlib.util import find_spec

try:
    import importlib.metadata as md
except ImportError:
    # Only reached on Python < 3.8; importlib.metadata is stdlib from 3.8 on.
    import importlib_metadata as md


def v(name):
    """Return the installed version of distribution name, or "not-installed"."""
    try:
        return md.version(name)
    except md.PackageNotFoundError:
        return "not-installed"


print("python:", sys.version.split()[0])
for name in ("pydantic", "typing-extensions", "spacy", "thinc", "en-core-web-sm"):
    print(f"{name}:", v(name))

# find_spec replaces pkgutil.find_loader, which is deprecated since Python 3.12
# and removed in 3.14, so this step keeps working if PY_BASE is bumped.
print("has en_core_web_sm:", find_spec("en_core_web_sm") is not None)
PY

# App code
COPY . .

# Writable caches (presumably so a non-root runtime user can write to them).
# /cache is made world-writable recursively. /tmp gets the standard 1777 mode:
# a plain 777 would clear the sticky bit and let any user delete or replace
# other users' temporary files.
RUN mkdir -p /cache/hf /tmp \
 && chmod -R 777 /cache \
 && chmod 1777 /tmp


# Database directory, created world-writable.
RUN mkdir -p /data \
 && chmod -R 777 /data

# If you have a starter DB in the repo, uncomment the next line to seed it:
#COPY app.db /data/app.db

# Database directory and file path exported to the container environment.
ENV DB_DIR=/data \
    DB_PATH=/data/app.db

# (optional) Declared as a volume so it can be mounted from the host for
# persistence. Kept after the writes above so the prepared directory is retained.
VOLUME ["/data"]

# Default listen port is 7860; a PORT environment variable set at runtime
# overrides it in both the health check and the server command below.
#ENV PORT=8000
EXPOSE 7860

# Shell-form health check, so ${PORT:-7860} is expanded by the shell when the
# probe runs. curl -f turns HTTP error statuses into a failing probe.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
  CMD curl -fsS "http://127.0.0.1:${PORT:-7860}/api/health/" || exit 1

# bind to $PORT provided by HF; include proxy headers.
# A shell is needed to expand ${PORT:-7860}; `exec` then replaces that shell
# with uvicorn so the server receives stop signals (SIGTERM) directly.
CMD ["sh","-c","exec uvicorn backend:app --host 0.0.0.0 --port ${PORT:-7860} --proxy-headers --forwarded-allow-ips='*'"]