2025-03-20 18:52:06 -07:00
|
|
|
|
# Base image: slim CPython 3.12 (Debian-based; apt-get/dpkg are used below).
FROM python:3.12-slim

# Every relative path in the COPY/RUN instructions below resolves under /app.
WORKDIR /app
|
|
|
|
|
|
|
2025-07-20 11:42:55 +03:00
|
|
|
|
# Install the OS packages the image depends on (one per line, sorted for diffs):
#   - build toolchain:  gcc, python3-dev
#   - TLS / fetching:   ca-certificates, curl, gnupg2, unzip, wget
#   - audio:            espeak-ng, libsndfile1
#   - shared libs:      libgl1, libglib2.0-0, libsm6, libxext6, libxrender1
#   - tooling:          dos2unix (applied to entrypoint.sh later), git
# Note: no Tesseract or CUDA packages are installed by this step.
# The apt package index is deleted in the same layer so it never reaches the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        dos2unix \
        espeak-ng \
        gcc \
        git \
        gnupg2 \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libsndfile1 \
        libxext6 \
        libxrender1 \
        python3-dev \
        unzip \
        wget \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
2026-02-13 16:16:02 -08:00
|
|
|
|
# Install Pandoc 3.x from the upstream GitHub release .deb.
# Rationale (unchanged): apt ships 2.17, which has broken table rendering, and
# pypandoc_binary bundles pandoc on Windows/macOS but may not on Linux, where
# it picks up this system-wide install instead.
#
# PANDOC_VERSION is a build argument so the release can be bumped without
# editing this file:
#   docker build --build-arg PANDOC_VERSION=<x.y> .
# The default reproduces the previously hard-coded download URL exactly.
ARG PANDOC_VERSION=3.9
# dpkg reports the image architecture (e.g. amd64/arm64), which selects the
# matching release asset. The .deb is removed in the same layer.
RUN ARCH=$(dpkg --print-architecture) \
    && wget -qO /tmp/pandoc.deb \
        "https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/pandoc-${PANDOC_VERSION}-1-${ARCH}.deb" \
    && dpkg -i /tmp/pandoc.deb \
    && rm /tmp/pandoc.deb
|
|
|
|
|
|
|
2025-07-20 11:42:55 +03:00
|
|
|
|
# Refresh the system CA store, then install the Python-side certificate helpers
# (certifi and pip-system-certs).
RUN update-ca-certificates
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade certifi pip-system-certs
|
|
|
|
|
|
|
2025-03-20 18:52:06 -07:00
|
|
|
|
# Bring in only the dependency manifests first, so the dependency-install layer
# below stays cached until pyproject.toml or uv.lock actually change.
COPY pyproject.toml uv.lock ./
|
|
|
|
|
|
|
2026-05-03 00:39:27 -07:00
|
|
|
|
# Deterministic dependency install driven by uv.lock.
#
# Why not `uv pip install -e .`? It re-resolves from pyproject.toml and ignores
# uv.lock, so prod can silently drift to newer upstream releases on every
# rebuild (e.g. deepagents 0.4.x -> 0.5.x breaking the FilesystemMiddleware
# imports). Instead the lock is exported to a requirements.txt and that file is
# handed to `uv pip install`, pinning every transitive package to the exact
# version recorded in uv.lock.
#
# torch/CUDA: torch is deliberately NOT installed from a separate cu* index.
# PyPI's Linux x86_64 torch wheels already ship CUDA-enabled and pull in
# nvidia-cudnn-cu13, nvidia-nccl-cu13, triton, etc. as install deps (all
# captured in uv.lock). Installing from cu121 first only wasted ~2GB of
# downloads that the lock-based install immediately replaced. If a specific
# CUDA version is required (driver compatibility, etc.), wire it through
# [tool.uv.sources] in pyproject.toml so the lock stays the source of truth.
#
# The exported requirements file is temporary and is deleted in the same layer.
RUN pip install --no-cache-dir uv \
    && uv export \
        --frozen \
        --no-dev \
        --no-hashes \
        --no-emit-project \
        --format requirements-txt \
        --output-file /tmp/requirements.txt \
    && uv pip install --system --no-cache-dir -r /tmp/requirements.txt \
    && rm /tmp/requirements.txt
|
2025-03-20 18:52:06 -07:00
|
|
|
|
|
2025-07-20 11:42:55 +03:00
|
|
|
|
# Point TLS clients at certifi's CA bundle.
# The RUN step asks the installed certifi package where its bundle lives, logs
# that path, and appends matching `export` lines to root's ~/.bashrc.
RUN cert_bundle=$(python -c "import certifi; print(certifi.where())") \
    && echo "Setting SSL_CERT_FILE to $cert_bundle" \
    && printf 'export SSL_CERT_FILE=%s\nexport REQUESTS_CA_BUNDLE=%s\n' \
        "$cert_bundle" "$cert_bundle" >> /root/.bashrc
# ENV cannot consume a value computed inside RUN, so the same location is
# hard-coded here. Keep the python3.12 path segment in sync with the base image.
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem \
    REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
|
|
|
|
|
|
|
|
|
|
|
|
# Pre-download EasyOCR models into /root/.EasyOCR/model to avoid runtime SSL
# issues.
#
# - TLS verification stays ON: ca-certificates is installed above, so
#   `--no-check-certificate` is unnecessary and would let a tampered archive
#   into the image unnoticed.
# - Download, extraction and cleanup happen in ONE layer, so the .zip archives
#   never persist in the image.
# - Still best-effort: a failed download or extraction does not fail the build,
#   but it is reported on stderr instead of being silently swallowed.
RUN mkdir -p /root/.EasyOCR/model \
    && for url in \
        https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip \
        https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip \
    ; do \
        archive="/tmp/$(basename "$url")"; \
        if wget -q -O "$archive" "$url" \
            && unzip -o -q "$archive" -d /root/.EasyOCR/model; then \
            echo "EasyOCR: pre-downloaded $(basename "$url")"; \
        else \
            echo "WARNING: could not pre-download $url; model is not baked into the image" >&2; \
        fi; \
        rm -f "$archive"; \
    done
|
|
|
|
|
|
|
|
|
|
|
|
# Pre-download Docling models by importing docling and constructing a
# DocumentConverter at build time.
#
# The previous one-liner embedded "\n" inside a double-quoted `python -c`
# string. /bin/sh does not expand "\n" there, so Python received a literal
# backslash, raised SyntaxError, and `|| true` hid it: the step never ran.
# A single-line statement avoids the escaping problem entirely.
#
# Still best-effort: a failure is reported on stderr but does not fail the build.
# NOTE(review): whether constructing DocumentConverter alone fetches the model
# weights is not visible from this file - confirm against docling's docs.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" \
    || echo "WARNING: Docling warm-up failed; continuing without pre-downloaded models" >&2
|
|
|
|
|
|
|
2026-05-03 00:39:27 -07:00
|
|
|
|
# Fetch the Chromium build used for web scraping, plus the OS libraries it
# needs (--with-deps). The playwright Python package itself was already
# installed from uv.lock above; this step only adds the browser.
RUN playwright install --with-deps chromium
|
2025-03-20 18:52:06 -07:00
|
|
|
|
|
|
|
|
|
|
# Copy the application source into the working directory (/app).
COPY . /app/

# Register the project itself as an editable install. All dependencies were
# already installed deterministically from uv.lock, so --no-deps guarantees this
# step cannot trigger a re-resolution that pulls newer versions.
RUN uv pip install \
        --system \
        --no-cache-dir \
        --no-deps \
        --editable .
|
|
|
|
|
|
|
2025-10-23 15:49:16 -07:00
|
|
|
|
# Place the entrypoint script at its final path, then normalise and enable it:
#   - dos2unix converts CRLF to LF (fixes scripts checked out on Windows)
#   - chmod +x makes it executable so CMD can exec it directly
COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
RUN dos2unix /app/scripts/docker/entrypoint.sh \
    && chmod +x /app/scripts/docker/entrypoint.sh
|
2025-10-23 15:49:16 -07:00
|
|
|
|
|
2026-02-10 22:57:56 -08:00
|
|
|
|
# Shared scratch directory for file uploads exchanged between the API and
# Worker containers. Python's tempfile module honours TMPDIR, so uploaded files
# are written here.
# Deployment note: mount the SAME volume at /shared_tmp on both the API and the
# Worker service in Coolify.
RUN mkdir -p /shared_tmp
ENV TMPDIR=/shared_tmp
|
|
|
|
|
|
|
2025-03-20 18:52:06 -07:00
|
|
|
|
# Python runtime settings:
#   PYTHONPATH   - adds /app (the WORKDIR) to Python's import search path
#   UVICORN_LOOP - pins the event loop to asyncio to prevent uvloop
#                  compatibility issues (presumably read by uvicorn as its
#                  loop setting - confirm against the entrypoint script)
ENV PYTHONPATH=/app \
    UVICORN_LOOP=asyncio
|
|
|
|
|
|
|
2026-02-28 23:59:28 -08:00
|
|
|
|
# glibc malloc tuning so freed memory is handed back to the OS more eagerly.
# Without it, Python's gc.collect() frees objects but the underlying C heap
# pages stay mapped (RSS never drops) because of sbrk fragmentation.
# Per mallopt(3):
#   MALLOC_MMAP_THRESHOLD_ - requests at/above this many bytes are served by mmap
#   MALLOC_TRIM_THRESHOLD_ - free bytes at the heap top that trigger a trim
#   MALLOC_MMAP_MAX_       - cap on simultaneously mmap-served allocations
ENV MALLOC_MMAP_THRESHOLD_=65536 \
    MALLOC_TRIM_THRESHOLD_=131072 \
    MALLOC_MMAP_MAX_=65536
|
|
|
|
|
|
|
2026-02-10 20:34:04 -08:00
|
|
|
|
# SERVICE_ROLE selects the process this container runs:
#   api    - FastAPI backend only (runs migrations on startup)
#   worker - Celery worker only
#   beat   - Celery beat scheduler only
#   all    - all three (legacy / dev default)
ENV SERVICE_ROLE=all

# Celery worker tuning; only relevant when SERVICE_ROLE is "worker" or "all".
#
# CELERY_QUEUES is a comma-separated list of queues to consume:
#   "surfsense"            - fast tasks only (file uploads, podcasts, etc.)
#   "surfsense.connectors" - slow connector indexing tasks only
#   ""                     - both queues (default, for single-worker setups)
ENV CELERY_MAX_WORKERS=10 \
    CELERY_MIN_WORKERS=2 \
    CELERY_MAX_TASKS_PER_CHILD=50 \
    CELERY_QUEUES=""
|
|
|
|
|
|
|
2025-03-20 18:52:06 -07:00
|
|
|
|
# Container contract: ports 8000 and 8001 are declared for operators and
# tooling (EXPOSE documents the ports; it does not publish them).
EXPOSE 8000 8001

# Default command, in exec form: start the entrypoint script directly.
CMD ["/app/scripts/docker/entrypoint.sh"]
|