Merge remote-tracking branch 'upstream/dev' into feature/multi-agent-with-task-parallelization

2026-05-19 18:45:15 +02:00 · 2026-05-15 16:44:22 +02:00 · 2026-05-15 16:44:22 +02:00 · 4980f9f1ba
commit 4980f9f1ba
parent 5327f3348c eea2d68098
193 changed files with 32777 additions and 565 deletions
--- a/surfsense_backend/.gitignore
+++ b/surfsense_backend/.gitignore
@ -13,5 +13,5 @@ celerybeat-schedule*
 celerybeat-schedule.*
 celerybeat-schedule.dir
 celerybeat-schedule.bak
-global_llm_config.yaml
+/app/config/global_llm_config.yaml
 app/templates/_generated/
--- a/surfsense_backend/Dockerfile
+++ b/surfsense_backend/Dockerfile
@ -1,8 +1,16 @@
-FROM python:3.12-slim
+# =============================================================================
+# SurfSense Backend — Multi-stage Dockerfile
+# =============================================================================
+# Graph: base → deps → models → {e2e, production}
+#   e2e        — tests/ via additional_contexts (docker-compose.e2e.yml)
+#   production — published ghcr.io image (docker-build.yml pins target)
+# =============================================================================
+
+# ─── Stage 1: base (system deps, Pandoc, certificates) ──────────────────────
+FROM python:3.12-slim AS base

 WORKDIR /app

-# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
 RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    python3-dev \
@ -11,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    unzip \
    gnupg2 \
+    ffmpeg \
    espeak-ng \
    libsndfile1 \
    libgl1 \
@ -22,21 +31,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

-# Install Pandoc 3.x from GitHub as a fallback for Linux where pypandoc_binary
-# may not bundle pandoc (apt ships 2.17 which has broken table rendering).
-# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks this up.
+RUN which ffmpeg && ffmpeg -version
+
+# Pandoc 3.x from GitHub Releases — apt ships 2.17 which has broken table rendering.
+# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks up this binary.
 RUN ARCH=$(dpkg --print-architecture) && \
    wget -qO /tmp/pandoc.deb "https://github.com/jgm/pandoc/releases/download/3.9/pandoc-3.9-1-${ARCH}.deb" && \
    dpkg -i /tmp/pandoc.deb && \
    rm /tmp/pandoc.deb

-# Update certificates and install SSL tools
 RUN update-ca-certificates
 RUN pip install --upgrade certifi pip-system-certs

-# Copy requirements
-COPY pyproject.toml .
-COPY uv.lock .
+ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD=FALSE
+
+
+# ─── Stage 2: deps (Python deps frozen from uv.lock) ────────────────────────
+FROM base AS deps
+
+COPY pyproject.toml uv.lock ./

 # Install all Python dependencies from uv.lock for deterministic builds.
 #
@ -49,9 +64,7 @@ COPY uv.lock .
 # Note on torch/CUDA: we do NOT install torch from a separate cu* index here.
 # PyPI's torch wheels for Linux x86_64 already ship CUDA-enabled and pull
 # nvidia-cudnn-cu13, nvidia-nccl-cu13, triton, etc. as install deps (all
-# captured in uv.lock). Installing from cu121 first only wasted ~2GB of
-# downloads that the lock-based install immediately replaced. If a specific
-# CUDA version is needed (driver compatibility, etc.), wire it through
+# captured in uv.lock). If a specific CUDA version is needed, wire it through
 # [tool.uv.sources] in pyproject.toml so the lock stays the source of truth.
 RUN pip install --no-cache-dir uv && \
    uv export --frozen --no-dev --no-hashes --no-emit-project \
@ -59,49 +72,42 @@ RUN pip install --no-cache-dir uv && \
    uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \
    rm /tmp/requirements.txt

-# Set SSL environment variables dynamically
-RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
-    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
-    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
-    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
-ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
-ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+
+# ─── Stage 3: models (pre-baked offline assets) ─────────────────────────────
+FROM deps AS models

 # Pre-download EasyOCR models to avoid runtime SSL issues
-RUN mkdir -p /root/.EasyOCR/model
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
-RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
+RUN mkdir -p /root/.EasyOCR/model && \
+    wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip      -O /root/.EasyOCR/model/english_g2.zip      || true && \
+    wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true && \
+    cd /root/.EasyOCR/model && \
+    (unzip -o english_g2.zip || true) && \
+    (unzip -o craft_mlt_25k.zip || true)

 # Pre-download Docling models
-RUN python -c "try:\n    from docling.document_converter import DocumentConverter\n    conv = DocumentConverter()\nexcept:\n    pass" || true
+RUN printf '%s\n' \
+    'try:' \
+    '    from docling.document_converter import DocumentConverter' \
+    '    DocumentConverter()' \
+    'except Exception:' \
+    '    pass' \
+    | python || true

-# Install Playwright browsers for web scraping (the playwright package itself
-# is already installed via uv.lock above)
+ARG EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+RUN python -c "from chonkie import AutoEmbeddings; AutoEmbeddings.get_embeddings('${EMBEDDING_MODEL}')"
+
+# Install Playwright browsers (the playwright python package itself is in deps)
 RUN playwright install chromium --with-deps

-# Copy source code
-COPY . .
-
-# Install the project itself in editable mode. Dependencies were already
-# installed deterministically from uv.lock above, so --no-deps prevents any
-# re-resolution that could pull newer versions.
-RUN uv pip install --system --no-cache-dir --no-deps -e .
-
-# Copy and set permissions for entrypoint script
-# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
-COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
-RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
-
 # Shared temp directory for file uploads between API and Worker containers.
 # Python's tempfile module uses TMPDIR, so uploaded files land here.
 # Mount the SAME volume at /shared_tmp on both API and Worker in Coolify.
 RUN mkdir -p /shared_tmp
-ENV TMPDIR=/shared_tmp

-# Prevent uvloop compatibility issues
 ENV PYTHONPATH=/app
 ENV UVICORN_LOOP=asyncio
+ENV TMPDIR=/shared_tmp
+ENV PYTHONUNBUFFERED=1

 # Tune glibc malloc to return freed memory to the OS more aggressively.
 # Without these, Python's gc.collect() frees objects but the underlying
@ -110,6 +116,56 @@ ENV MALLOC_MMAP_THRESHOLD_=65536
 ENV MALLOC_TRIM_THRESHOLD_=131072
 ENV MALLOC_MMAP_MAX_=65536

+
+# ─── Stage 4: e2e (production source + tests/ + e2e entrypoint) ─────────────
+# Built via `docker buildx build --target e2e`. The default build target is
+# `production` (the last stage), so this stage is opt-in for CI only.
+#
+# `tests/` is excluded from the main build context by .dockerignore (so prod
+# can never accidentally ship test fakes). The e2e stage receives tests/
+# through an "additional context" passed by docker-compose.e2e.yml — see
+# https://docs.docker.com/reference/compose-file/build/#additional_contexts
+FROM models AS e2e
+
+# Same source copy as production. .dockerignore filters out tests/.
+COPY . .
+
+# Bring tests/ in via the named additional build context. CI passes
+#   --build-context tests-source=./tests
+# (or the equivalent additional_contexts entry in docker-compose.e2e.yml).
+COPY --from=tests-source . ./tests/
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+COPY scripts/docker/entrypoint.e2e.sh /app/scripts/docker/entrypoint.e2e.sh
+RUN dos2unix /app/scripts/docker/entrypoint.e2e.sh && chmod +x /app/scripts/docker/entrypoint.e2e.sh
+
+# SERVICE_ROLE is overridden per service in docker-compose.e2e.yml (api / worker).
+ENV SERVICE_ROLE=api
+
+EXPOSE 8000-8001
+CMD ["/app/scripts/docker/entrypoint.e2e.sh"]
+
+
+# ─── Stage 5: production (published ghcr.io image) ──────────────────────────
+# CI pins `target: production`; also the default for `docker build` / dev compose.
+FROM models AS production
+
+# Copy source code (tests/ excluded by .dockerignore — production never ships tests).
+COPY . .
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
+COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
+RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
+
 # SERVICE_ROLE controls which process this container runs:
 #   api     – FastAPI backend only (runs migrations on startup)
 #   worker  – Celery worker only
@ -127,6 +183,5 @@ ENV CELERY_MAX_TASKS_PER_CHILD=50
 #   ""                       – both queues (default, for single-worker setups)
 ENV CELERY_QUEUES=""

-# Run
 EXPOSE 8000-8001
-CMD ["/app/scripts/docker/entrypoint.sh"]
+CMD ["/app/scripts/docker/entrypoint.sh"]
--- a/surfsense_backend/alembic/env.py
+++ b/surfsense_backend/alembic/env.py
@ -67,7 +67,11 @@ def run_migrations_offline() -> None:


 def do_run_migrations(connection: Connection) -> None:
-    context.configure(connection=connection, target_metadata=target_metadata)
+    context.configure(
+        connection=connection,
+        target_metadata=target_metadata,
+        transaction_per_migration=True,
+    )

    with context.begin_transaction():
        context.run_migrations()
--- a/surfsense_backend/alembic/versions/130_add_agent_action_log.py
+++ b/surfsense_backend/alembic/versions/130_add_agent_action_log.py
@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_action_log"):
+        return
+
    op.create_table(
        "agent_action_log",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/131_add_document_revisions.py
+++ b/surfsense_backend/alembic/versions/131_add_document_revisions.py
@ -29,6 +29,21 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+
+    if inspector.has_table("document_revisions") and inspector.has_table(
+        "folder_revisions"
+    ):
+        return
+
+    if not inspector.has_table("document_revisions"):
+        _create_document_revisions()
+    if not inspector.has_table("folder_revisions"):
+        _create_folder_revisions()
+
+
+def _create_document_revisions() -> None:
    op.create_table(
        "document_revisions",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
@ -74,6 +89,8 @@ def upgrade() -> None:
        ),
    )

+
+def _create_folder_revisions() -> None:
    op.create_table(
        "folder_revisions",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
+++ b/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_permission_rules"):
+        return
+
    op.create_table(
        "agent_permission_rules",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
+++ b/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
@ -50,29 +50,39 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
-    op.add_column(
-        "agent_action_log",
-        sa.Column("tool_call_id", sa.String(length=64), nullable=True),
-    )
-    op.add_column(
-        "agent_action_log",
-        sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
-    )
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("agent_action_log")}
+    indexes = {i["name"] for i in inspector.get_indexes("agent_action_log")}

-    op.create_index(
-        "ix_agent_action_log_tool_call_id",
-        "agent_action_log",
-        ["tool_call_id"],
-    )
-    op.create_index(
-        "ix_agent_action_log_chat_turn_id",
-        "agent_action_log",
-        ["chat_turn_id"],
-    )
+    if "tool_call_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("tool_call_id", sa.String(length=64), nullable=True),
+        )
+    if "chat_turn_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
+        )

-    op.execute(
-        "UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
-    )
+    if "ix_agent_action_log_tool_call_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_tool_call_id",
+            "agent_action_log",
+            ["tool_call_id"],
+        )
+    if "ix_agent_action_log_chat_turn_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_chat_turn_id",
+            "agent_action_log",
+            ["chat_turn_id"],
+        )
+
+    if "turn_id" in columns:
+        op.execute(
+            "UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
+        )


 def downgrade() -> None:
--- a/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
+++ b/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
@ -36,15 +36,22 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
-    op.add_column(
-        "new_chat_messages",
-        sa.Column("turn_id", sa.String(length=64), nullable=True),
-    )
-    op.create_index(
-        "ix_new_chat_messages_turn_id",
-        "new_chat_messages",
-        ["turn_id"],
-    )
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("new_chat_messages")}
+    indexes = {i["name"] for i in inspector.get_indexes("new_chat_messages")}
+
+    if "turn_id" not in columns:
+        op.add_column(
+            "new_chat_messages",
+            sa.Column("turn_id", sa.String(length=64), nullable=True),
+        )
+    if "ix_new_chat_messages_turn_id" not in indexes:
+        op.create_index(
+            "ix_new_chat_messages_turn_id",
+            "new_chat_messages",
+            ["turn_id"],
+        )


 def downgrade() -> None:
--- a/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
+++ b/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
@ -27,6 +27,8 @@ from __future__ import annotations

 from collections.abc import Sequence

+import sqlalchemy as sa
+
 from alembic import op

 revision: str = "137"
@ -39,6 +41,11 @@ _INDEX_NAME = "ux_agent_action_log_reverse_of"


 def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes("agent_action_log")}
+    if _INDEX_NAME in indexes:
+        return
+
    # Defensively de-dup any pre-existing double-revert rows before
    # adding the unique index. Keeps the OLDEST row (smallest id) and
    # NULLs out the duplicates' ``reverse_of`` so they survive as audit
--- a/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
+++ b/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
@ -53,6 +53,11 @@ TABLE_NAME = "new_chat_messages"


 def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes(TABLE_NAME)}
+    if INDEX_NAME in indexes:
+        return
+
    op.create_index(
        INDEX_NAME,
        TABLE_NAME,
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -473,10 +473,16 @@ def initialize_vision_llm_router():
 class Config:
    # Check if ffmpeg is installed
    if not is_ffmpeg_installed():
-        import static_ffmpeg
+        allow_static_ffmpeg = (
+            os.getenv("SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD", "TRUE").upper()
+            == "TRUE"
+        )
+        if allow_static_ffmpeg:
+            import static_ffmpeg
+
+            # ffmpeg installed on first call to add_paths(), threadsafe.
+            static_ffmpeg.add_paths()

-        # ffmpeg installed on first call to add_paths(), threadsafe.
-        static_ffmpeg.add_paths()
        # check if ffmpeg is installed again
        if not is_ffmpeg_installed():
            raise ValueError(
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -134,12 +134,92 @@ class EtlPipelineService:
        else:
            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")

+        # When the operator opts into vision-LLM at ingest, walk the
+        # original file's embedded images and append a structured
+        # "Image Content" section. The parser's own OCR (Docling
+        # do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
+        # image; this side handles the *visual* description which the
+        # parsers all drop today.
+        content = await self._maybe_append_picture_descriptions(request, content)
+
        return EtlResult(
            markdown_content=content,
            etl_service=etl_service,
            content_type="document",
        )

+    async def _maybe_append_picture_descriptions(
+        self, request: EtlRequest, markdown: str
+    ) -> str:
+        if self._vision_llm is None:
+            return markdown
+
+        from app.etl_pipeline.picture_describer import (
+            describe_pictures,
+            merge_descriptions_into_markdown,
+        )
+
+        # Per-image OCR runner: re-feed each extracted image through
+        # the ETL pipeline *as a standalone image* (no vision LLM, so
+        # the IMAGE branch falls through to the document parser, which
+        # OCRs the image with the configured backend -- Docling /
+        # Azure DI / LlamaCloud). This gives us per-image OCR text
+        # attached to the inline image block, in addition to the
+        # page-level OCR that the parser already merges into the main
+        # markdown stream. The fresh sub-service gets vision_llm=None
+        # so this call cannot recurse back into picture_describer.
+        async def _ocr_image(image_path: str, image_name: str) -> str:
+            try:
+                sub = EtlPipelineService(vision_llm=None)
+                ocr_result = await sub.extract(
+                    EtlRequest(file_path=image_path, filename=image_name)
+                )
+            except (
+                EtlUnsupportedFileError,
+                EtlServiceUnavailableError,
+            ) as exc:
+                # Common case: the configured ETL service can't OCR
+                # this image format (or no service is configured at
+                # all). Don't spam warnings -- just no OCR for it.
+                logging.debug(
+                    "Skipping per-image OCR for %s: %s", image_name, exc
+                )
+                return ""
+            return ocr_result.markdown_content
+
+        try:
+            result = await describe_pictures(
+                request.file_path,
+                request.filename,
+                self._vision_llm,
+                ocr_runner=_ocr_image,
+            )
+        except Exception:
+            # Picture description is additive; never let it fail an
+            # otherwise-successful document extraction.
+            logging.warning(
+                "Picture description failed for %s, returning parser output unchanged",
+                request.filename,
+                exc_info=True,
+            )
+            return markdown
+
+        if not result.descriptions:
+            return markdown
+
+        merged = merge_descriptions_into_markdown(markdown, result)
+        logging.info(
+            "Vision LLM described %d image(s) in %s "
+            "(skipped: %d small / %d large / %d duplicate, %d failed)",
+            len(result.descriptions),
+            request.filename,
+            result.skipped_too_small,
+            result.skipped_too_large,
+            result.skipped_duplicate,
+            result.failed,
+        )
+        return merged
+
    async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
        """Try Azure Document Intelligence first (when configured) then LlamaCloud.

--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@ -4,12 +4,34 @@ import os

 from langchain_core.messages import HumanMessage

+# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
+# A standalone image IS the document, so we want everything: visual
+# content plus any text the model can read off it. The output is
+# combined markdown that the chunker treats as the full document body.
 _PROMPT = (
    "Describe this image in markdown. "
    "Transcribe any visible text verbatim. "
    "Be concise but complete — let the image content guide the level of detail."
 )

+# Per-image-in-PDF prompt. Here the image is *inside* a larger
+# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
+# already running OCR over the whole page — including text rendered
+# into images. So we explicitly tell the model NOT to transcribe text
+# and to focus only on visual interpretation. This avoids paying
+# output tokens for OCR content the ETL pipeline already captured.
+_DESCRIPTION_PROMPT = (
+    "Describe what this image visually depicts in concise markdown. "
+    "Focus on visual content — anatomy, structures, charts, diagrams, "
+    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
+    "histology slide), and any clinically or structurally relevant "
+    "findings.\n\n"
+    "Do NOT transcribe text from the image. Any text in the image "
+    "(axis labels, annotations, scale bars, lab values, etc.) is "
+    "already extracted by a separate OCR pipeline; duplicating it "
+    "here would be redundant. Stick to the visual interpretation."
+)
+
 _MAX_IMAGE_BYTES = (
    5 * 1024 * 1024
 )  # 5 MB (Anthropic Claude's limit, the most restrictive)
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
    return f"data:{mime_type};base64,{encoded}"


-async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
-    data_url = _image_to_data_url(file_path)
+async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
    message = HumanMessage(
        content=[
-            {"type": "text", "text": _PROMPT},
+            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    if not text or not text.strip():
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Single-shot: returns combined markdown for a standalone image upload.
+
+    Used when the operator uploads an image file directly (jpg/png/etc).
+    The image is the document, so the prompt asks for both visual
+    description and verbatim text in one go.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _PROMPT, data_url, filename)
+
+
+async def parse_image_for_description(
+    file_path: str, filename: str, llm
+) -> str:
+    """Visual-description-only call for per-image-in-PDF use.
+
+    Used by ``picture_describer`` when an image is embedded inside a
+    larger document. Returns a markdown description of what the image
+    visually depicts; deliberately does NOT include text-in-image OCR
+    because the ETL service (Docling, Azure DI, LlamaCloud, ...) is
+    already running OCR over the entire page and would duplicate that
+    text content.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _DESCRIPTION_PROMPT, data_url, filename)
+
+
+__all__ = [
+    "parse_image_for_description",
+    "parse_with_vision_llm",
+]
--- a/surfsense_backend/app/etl_pipeline/picture_describer.py
+++ b/surfsense_backend/app/etl_pipeline/picture_describer.py
@ -0,0 +1,678 @@
+"""Extract embedded images from PDFs, describe them, and inject the
+descriptions inline into the parser's markdown.
+
+When the operator passes ``use_vision_llm=True`` for a PDF, the document
+parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
+but mostly drop the actual image content -- a CT scan inside a clinical
+PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
+and the caption text below it.
+
+This module fills that gap. After the document parser produces markdown
+text, we:
+
+1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
+   image (deduped by sha256, size-capped to match the vision LLM's own
+   limits).
+2. Run the vision LLM on each unique image (visual description) and,
+   in parallel when an OCR runner is provided, re-feed the same image
+   through the ETL service for per-image OCR.
+3. **Inject** a horizontal-rule-delimited markdown section -- with
+   named "OCR text" and "Visual description" sub-sections -- where the
+   image actually appears in the parser's markdown. Two splice modes,
+   chosen by which marker the parser emitted:
+
+   - **Replace** Docling-style ``<!-- image -->`` placeholders (and an
+     optional ``Image: <filename>`` caption line). The placeholder
+     carries no useful content of its own, so we substitute our block
+     for it.
+   - **Append after** layout-aware ``<figure>...</figure>`` blocks
+     (Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
+     already contain parser-extracted chart values / OCR'd labels /
+     captions, which are themselves useful for retrieval -- so we
+     PRESERVE the figure verbatim and add our vision-LLM block
+     immediately after it. The chunk then contains both the parser's
+     structured numbers AND the VLM's semantic interpretation.
+
+   Either way, the image content stays in context with the surrounding
+   document body rather than getting orphaned at the end -- crucial for
+   retrieval, where a single chunk should contain the question, the
+   image content, and the answer options together.
+
+If no placeholders, figures, or captions can be matched (e.g. an
+unusual parser output), we fall back to appending an
+``## Image Content`` section so no image content is silently lost.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import hashlib
+import logging
+import re
+import tempfile
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+# Type alias for the OCR callback. Takes (file_path, filename), returns
+# the OCR'd markdown text -- or empty string if no text was found, or
+# raises if OCR failed unrecoverably (which the describer catches and
+# treats as "no OCR for this image" rather than failing the whole doc).
+OcrRunner = Callable[[str, str], Awaitable[str]]
+
+logger = logging.getLogger(__name__)
+
+
+# Bound how many vision LLM calls we make in parallel for a single
+# document. Vision models are typically rate-limited; 4 concurrent
+# calls is a safe default that respects most provider limits while
+# keeping wall-clock manageable for image-heavy PDFs.
+_VISION_CONCURRENCY = 4
+
+# Match parse_with_vision_llm's per-image cap so we don't even attempt
+# images that the vision LLM would reject anyway (Anthropic's 5 MB
+# limit is the most restrictive among the major providers).
+_MAX_IMAGE_BYTES = 5 * 1024 * 1024
+
+# Skip degenerate images: tracking pixels, very small decorative dots,
+# scanner-introduced artefacts. We can't cheaply check pixel dimensions
+# without decoding the image, so we approximate: anything under 1 KB is
+# almost certainly not informative content.
+_MIN_IMAGE_BYTES = 1024
+
+
+@dataclass
+class PictureDescription:
+    """A single extracted image with its visual description and (optionally) OCR.
+
+    Two content fields by design, each produced by the *right* tool:
+
+    - ``description``: the vision LLM's visual interpretation. What the
+      image depicts (anatomy, charts, layout, etc.) -- the semantic
+      content that only a vision model can produce.
+    - ``ocr_text``: text-in-image extracted by re-feeding the image
+      through the configured ETL service (Docling/Azure DI/LlamaCloud)
+      *as if it were a standalone image upload*. Specialist OCR engine,
+      per-image attribution, no vision LLM tokens spent on text. None
+      when no OCR was requested or OCR found no text.
+    """
+
+    page_number: int                # 1-indexed
+    ordinal_in_page: int            # 0-indexed within the page
+    name: str                       # name pypdf assigned (e.g. "Im0")
+    sha256: str                     # hash of the raw image bytes
+    description: str                # visual description (markdown)
+    ocr_text: str | None = None     # OCR text from the ETL service, if any
+
+
+@dataclass
+class PictureExtractionResult:
+    """Aggregate result of extracting all pictures from a document."""
+
+    descriptions: list[PictureDescription] = field(default_factory=list)
+    skipped_too_small: int = 0
+    skipped_too_large: int = 0
+    skipped_duplicate: int = 0
+    failed: int = 0
+
+    @property
+    def has_content(self) -> bool:
+        return bool(self.descriptions)
+
+
+def _is_pdf(filename: str) -> bool:
+    return filename.lower().endswith(".pdf")
+
+
+def _pick_suffix(name: str) -> str:
+    lower = name.lower()
+    for ext in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp"):
+        if lower.endswith(ext):
+            return ".jpeg" if ext == ".jpg" else ext
+    return ".png"
+
+
+def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
+    """Pull every embedded image out of a PDF.
+
+    Returns ``(page_number_1_indexed, ordinal_in_page, name, bytes)``.
+    Per-page and per-image failures are logged and skipped -- one bad
+    image must not fail the whole document.
+    """
+
+    from pypdf import PdfReader
+
+    out: list[tuple[int, int, str, bytes]] = []
+    try:
+        reader = PdfReader(file_path)
+    except Exception:
+        logger.warning(
+            "pypdf failed to open %s for image extraction",
+            file_path,
+            exc_info=True,
+        )
+        return out
+
+    for page_idx, page in enumerate(reader.pages):
+        try:
+            images = list(page.images)
+        except Exception:
+            logger.warning(
+                "pypdf failed to enumerate images on page %d of %s",
+                page_idx + 1,
+                file_path,
+                exc_info=True,
+            )
+            continue
+        for img_idx, img in enumerate(images):
+            try:
+                name = getattr(img, "name", None) or f"page{page_idx + 1}_img{img_idx}"
+                data = img.data
+            except Exception:
+                logger.warning(
+                    "pypdf failed to read image %d on page %d of %s",
+                    img_idx,
+                    page_idx + 1,
+                    file_path,
+                    exc_info=True,
+                )
+                continue
+            out.append((page_idx + 1, img_idx, name, data))
+    return out
+
+
+async def _describe_one(
+    page_number: int,
+    ordinal: int,
+    name: str,
+    sha256: str,
+    data: bytes,
+    vision_llm: Any,
+    semaphore: asyncio.Semaphore,
+    ocr_runner: OcrRunner | None,
+) -> PictureDescription | None:
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    suffix = _pick_suffix(name)
+    # NamedTemporaryFile + delete=False because the vision-LLM helper
+    # and the OCR runner each open the path themselves; we clean up in
+    # the finally. Same temp file feeds both, which is correct: vision
+    # LLM and OCR are looking at the same image, just asking different
+    # questions of it.
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(data)
+        tmp_path = tmp.name
+    try:
+        async with semaphore:
+            tasks: list[Awaitable[Any]] = [
+                parse_image_for_description(tmp_path, name, vision_llm),
+            ]
+            if ocr_runner is not None:
+                tasks.append(ocr_runner(tmp_path, name))
+
+            # return_exceptions=True so a failure in one branch (most
+            # often OCR) doesn't poison the other.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        description_result = results[0]
+        if isinstance(description_result, BaseException):
+            logger.warning(
+                "Vision LLM failed for image %s on page %d, skipping",
+                name,
+                page_number,
+                exc_info=description_result,
+            )
+            return None
+        description = str(description_result)
+
+        ocr_text: str | None = None
+        if ocr_runner is not None and len(results) > 1:
+            ocr_result = results[1]
+            if isinstance(ocr_result, BaseException):
+                logger.warning(
+                    "Per-image OCR failed for image %s on page %d, "
+                    "omitting OCR field for this image",
+                    name,
+                    page_number,
+                    exc_info=ocr_result,
+                )
+            else:
+                stripped = str(ocr_result).strip()
+                # Empty OCR (or whitespace-only) means the OCR engine
+                # found no text in this image. Record that as None so
+                # the rendered block doesn't include a useless empty tag.
+                ocr_text = stripped or None
+    finally:
+        with contextlib.suppress(OSError):
+            Path(tmp_path).unlink()
+
+    return PictureDescription(
+        page_number=page_number,
+        ordinal_in_page=ordinal,
+        name=name,
+        sha256=sha256,
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+async def describe_pictures(
+    file_path: str,
+    filename: str,
+    vision_llm: Any,
+    *,
+    ocr_runner: OcrRunner | None = None,
+) -> PictureExtractionResult:
+    """Extract embedded images from a document and describe each via vision LLM.
+
+    When ``ocr_runner`` is provided, each image is also passed to it
+    (in parallel with the vision LLM) and the returned text is recorded
+    in :attr:`PictureDescription.ocr_text`. The runner is typically a
+    closure over a vision-LLM-less ``EtlPipelineService`` -- this lets
+    the same OCR engine that processes standalone image uploads
+    (Docling/Azure DI/LlamaCloud) also process embedded-in-PDF images,
+    giving per-image OCR attribution alongside the page-level OCR that
+    the parser already does.
+
+    Currently PDF-only. For non-PDF documents this returns an empty
+    result and the caller should leave the parser's markdown untouched.
+    """
+
+    result = PictureExtractionResult()
+    if not _is_pdf(filename) or vision_llm is None:
+        return result
+
+    raw_images = _extract_pdf_images(file_path)
+    if not raw_images:
+        return result
+
+    seen_hashes: set[str] = set()
+    eligible: list[tuple[int, int, str, str, bytes]] = []
+    for page_number, ordinal, name, data in raw_images:
+        if len(data) > _MAX_IMAGE_BYTES:
+            result.skipped_too_large += 1
+            continue
+        if len(data) < _MIN_IMAGE_BYTES:
+            result.skipped_too_small += 1
+            continue
+        sha = hashlib.sha256(data).hexdigest()
+        if sha in seen_hashes:
+            result.skipped_duplicate += 1
+            continue
+        seen_hashes.add(sha)
+        eligible.append((page_number, ordinal, name, sha, data))
+
+    if not eligible:
+        return result
+
+    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
+    tasks = [
+        _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
+        for (p, o, n, sha, d) in eligible
+    ]
+    descriptions = await asyncio.gather(*tasks)
+    for desc in descriptions:
+        if desc is None:
+            result.failed += 1
+        else:
+            result.descriptions.append(desc)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Rendering: build the per-image markdown block + inject inline.
+# ---------------------------------------------------------------------------
+
+
+def _format_image_block(
+    name: str,
+    description: str,
+    ocr_text: str | None = None,
+) -> str:
+    """Render the per-image block as a horizontal-rule-delimited section.
+
+    Why no blockquote / no raw HTML / no XML?
+    -----------------------------------------
+    We tried each in turn and each failed in the document viewer:
+
+    - **Raw HTML / XML** (``<image>...</image>``): unknown elements
+      have no render rules in Streamdown or PlateJS, so the content
+      survives in the markdown source but is invisible to humans.
+    - **Blockquote with nested blocks**: nested fenced code blocks,
+      bullet lists, numbered lists, tables -- any *block* element
+      inside a ``>``-prefixed blockquote -- gets evicted by Streamdown
+      / remark, dropping everything after it onto the document level.
+      The vision LLM happily produces bulleted descriptions, so this
+      hit the viewer in practice.
+
+    A horizontal-rule-delimited section, by contrast, contains only
+    standard top-level markdown -- bold labels and free-form body --
+    so the description's native markdown (lists, prose, tables) all
+    renders natively in every renderer.
+
+    Layout (OCR section omitted when ``ocr_text`` is None/empty):
+
+        ---
+
+        **Embedded image:** `MM-130-a.jpeg`
+
+        **OCR text:**
+        Slice 24 / 60
+        L
+        R
+
+        **Visual description:**
+
+        - Axial contrast-enhanced CT showing a large cystic mass...
+        - Mass effect on the adjacent stomach.
+
+        ---
+
+    Still LLM-friendly: the ``**Embedded image:** `<filename>``` prefix
+    is unique and trivially regex-able (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).
+
+    Returned with leading and trailing blank-line padding so the rules
+    never merge with adjacent paragraphs after splicing.
+    """
+
+    parts: list[str] = [f"**Embedded image:** `{name}`"]
+
+    if ocr_text and ocr_text.strip():
+        # Bold "OCR text:" label with trailing two spaces (=> <br>) so
+        # the first OCR line sits directly under the label rather than
+        # forcing a paragraph break that some renderers would style
+        # differently. Subsequent OCR lines also use trailing two spaces
+        # for hard breaks, so multi-line OCR renders line-by-line
+        # without needing a (fragile) fenced code block.
+        ocr_clean_lines = [
+            ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
+        ]
+        parts.append("")
+        parts.append("**OCR text:**  ")
+        for i, raw in enumerate(ocr_clean_lines):
+            suffix = "" if i == len(ocr_clean_lines) - 1 else "  "
+            parts.append(f"{raw}{suffix}")
+
+    parts.append("")
+    parts.append("**Visual description:**")
+    parts.append("")
+    parts.append(description.strip())
+
+    body = "\n".join(parts)
+    # Wrap with blank lines + horizontal rules so the block is clearly
+    # delimited from surrounding paragraphs and survives splicing into
+    # the middle of any markdown stream.
+    return "\n\n---\n\n" + body + "\n\n---\n\n"
+
+
+# Patterns we'll try to splice into. Each pattern captures the
+# original-PDF filename when one is available (group 1).
+#
+# Replace-style markers (the matched span is substituted with our block
+# because it carries no useful content of its own):
+#
+# 1. Docling's image placeholder followed by an "Image: <filename>"
+#    caption line. This is what our medxpertqa renderer produces:
+#    reportlab places the JPEG, then a caption, and Docling outputs
+#    the placeholder + caption.
+# 2. Docling's image placeholder alone (filename unknown -- we fall
+#    back to pypdf's name).
+# 3. A bare "Image: <filename>" caption line with no preceding
+#    placeholder. Rare in practice, but covers parsers that drop the
+#    placeholder entirely.
+_PLACEHOLDER_WITH_CAPTION = re.compile(
+    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
+    re.IGNORECASE,
+)
+_PLACEHOLDER_ONLY = re.compile(
+    r"<!--\s*image\s*-->",
+    re.IGNORECASE,
+)
+_CAPTION_ONLY = re.compile(
+    r"^[ \t]*Image:\s*(\S+)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Append-after marker (the matched span is preserved verbatim and our
+# block is inserted immediately after it):
+#
+# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
+#    Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
+#    The figure's own contents -- chart bar values, axis labels,
+#    inline ``<figcaption>``, embedded ``<table>`` for tabular figures
+#    -- are themselves specialist OCR output, so we keep them and add
+#    our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
+#    optional attributes like ``<figure id="...">``; ``re.DOTALL``
+#    lets ``.`` cross the newlines inside the block.
+_FIGURE_BLOCK = re.compile(
+    r"<figure\b[^>]*>.*?</figure>",
+    re.DOTALL | re.IGNORECASE,
+)
+
+
+def _replace_one_match(
+    markdown: str,
+    pattern: re.Pattern[str],
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Replace the first occurrence of ``pattern`` with the next image block.
+
+    Returns the new markdown and the new ``desc_idx`` (advanced if a
+    replacement happened, unchanged otherwise).
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    match = pattern.search(markdown)
+    if not match:
+        return markdown, desc_idx
+
+    desc = descriptions[desc_idx]
+    captured_name: str | None = None
+    if match.groups():
+        captured_name = match.group(1)
+    name = captured_name or desc.name
+    block = _format_image_block(name, desc.description, desc.ocr_text)
+
+    new_markdown = markdown[: match.start()] + block + markdown[match.end():]
+    return new_markdown, desc_idx + 1
+
+
+def _splice_after_figures(
+    markdown: str,
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Append vision-LLM blocks immediately after each ``<figure>...</figure>``.
+
+    Layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+    premium) wrap each figure / chart / inline table in this tag and
+    carry their own OCR of the figure's text content inside it. That
+    content is useful on its own, so we keep the original block
+    verbatim and add our vision-LLM block right after it -- giving
+    retrieval both signals in the same chunk.
+
+    Descriptions are matched to figures in document order (first
+    description -> first figure, etc.). All splice points are computed
+    upfront with :func:`re.finditer` and applied in REVERSE order so
+    earlier offsets stay valid as the markdown grows. Returns the
+    advanced ``desc_idx`` for the caller's leftover-handling.
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    matches = list(_FIGURE_BLOCK.finditer(markdown))
+    if not matches:
+        return markdown, desc_idx
+
+    n_to_splice = min(len(matches), len(descriptions) - desc_idx)
+    if n_to_splice <= 0:
+        return markdown, desc_idx
+
+    out = markdown
+    # Walk in reverse so each splice's end-offset still points at the
+    # right place in the (still-mutating) string.
+    for i in range(n_to_splice - 1, -1, -1):
+        match = matches[i]
+        desc = descriptions[desc_idx + i]
+        block = _format_image_block(desc.name, desc.description, desc.ocr_text)
+        out = out[: match.end()] + block + out[match.end():]
+
+    return out, desc_idx + n_to_splice
+
+
+def inject_descriptions_inline(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> tuple[str, int]:
+    """Splice per-image markdown blocks into the document at image positions.
+
+    Walks the markdown left-to-right, consuming descriptions in order.
+    Tries two splicing strategies, in this order:
+
+    1. **Append-after** for ``<figure>...</figure>`` blocks emitted by
+       layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+       premium). The figure block carries the parser's own OCR of the
+       figure -- we preserve it and add our vision-LLM block right
+       after.
+    2. **Replace** for Docling-style markers, in priority order:
+
+       - ``<!-- image -->`` followed by ``Image: <filename>`` caption,
+       - ``<!-- image -->`` placeholder alone,
+       - bare ``Image: <filename>`` caption.
+
+    A document typically uses one style or the other (depending on
+    which parser produced its markdown), so the two paths don't fight
+    each other in practice. When they do co-occur, figures are
+    consumed first.
+
+    Returns ``(new_markdown, n_inlined)`` -- the count of descriptions
+    that were placed inline. The caller decides what to do with any
+    leftover descriptions (typically: append them at the end).
+    """
+
+    if not result.descriptions:
+        return markdown, 0
+
+    descriptions = result.descriptions
+    desc_idx = 0
+    out = markdown
+
+    # Step 1: layout-aware figures. One-shot batch -- finds ALL
+    # <figure> blocks, splices in document order until we exhaust
+    # either side.
+    out, desc_idx = _splice_after_figures(out, descriptions, desc_idx)
+
+    # Step 2: Docling-style replacement markers. One match per
+    # iteration, so a doc that has both a figure (consumed above) and
+    # a Docling placeholder (consumed below) still works.
+    while desc_idx < len(descriptions):
+        before_idx = desc_idx
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_WITH_CAPTION, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_ONLY, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(
+            out, _CAPTION_ONLY, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        # No more positions to splice into.
+        break
+
+    return out, desc_idx
+
+
+def render_appended_section(
+    descriptions: list[PictureDescription],
+    *,
+    skip_notes: PictureExtractionResult | None = None,
+    heading: str = "## Image Content (vision-LLM extracted)",
+) -> str:
+    """Render leftover descriptions as an appended section.
+
+    Used as a fallback when not every description could be inlined
+    (either because the parser produced no detectable image markers,
+    or because there were more extracted images than markers).
+    """
+
+    if not descriptions and not skip_notes:
+        return ""
+
+    parts: list[str] = ["", heading, ""]
+    for desc in descriptions:
+        parts.append(
+            _format_image_block(desc.name, desc.description, desc.ocr_text)
+        )
+        parts.append("")
+
+    if skip_notes is not None:
+        notes: list[str] = []
+        if skip_notes.skipped_too_large:
+            notes.append(f"{skip_notes.skipped_too_large} too large (> 5 MB)")
+        if skip_notes.skipped_too_small:
+            notes.append(f"{skip_notes.skipped_too_small} too small (< 1 KB)")
+        if skip_notes.skipped_duplicate:
+            notes.append(f"{skip_notes.skipped_duplicate} duplicate")
+        if skip_notes.failed:
+            notes.append(f"{skip_notes.failed} failed")
+        if notes:
+            parts.append(f"_Note: {', '.join(notes)} image(s) skipped._")
+
+    return "\n".join(parts)
+
+
+def merge_descriptions_into_markdown(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> str:
+    """Top-level: inline what we can, append what's left over.
+
+    This is the function the ETL pipeline actually calls. It guarantees
+    that no successfully-described image is silently dropped: anything
+    we can't splice inline gets appended at the end with a heading
+    that makes it clear those came from the document but weren't
+    location-matched.
+    """
+
+    if not result.descriptions:
+        return markdown
+
+    new_markdown, n_inlined = inject_descriptions_inline(markdown, result)
+    leftover = result.descriptions[n_inlined:]
+
+    if not leftover:
+        return new_markdown
+
+    # Distinguish in the heading whether NONE were inlined (parser
+    # produced no markers at all) vs SOME (mismatched count).
+    heading = (
+        "## Image Content (vision-LLM extracted)"
+        if n_inlined == 0
+        else "## Image Content (additional, no inline marker found)"
+    )
+    section = render_appended_section(leftover, heading=heading)
+    if not section:
+        return new_markdown
+    return f"{new_markdown.rstrip()}\n\n{section.lstrip()}\n"
+
+
+__all__ = [
+    "PictureDescription",
+    "PictureExtractionResult",
+    "describe_pictures",
+    "inject_descriptions_inline",
+    "merge_descriptions_into_markdown",
+    "render_appended_section",
+]
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@ -77,10 +77,16 @@ class DoclingService:
            # Create pipeline options with version-safe attribute checking
            pipeline_options = PdfPipelineOptions()

-            # Disable OCR (user request)
+            # Enable OCR so text-in-image (chart axes, ECG annotations,
+            # lab tables embedded as images, scanned pages, etc.) is
+            # lifted into the main markdown stream. This pairs with the
+            # vision-LLM picture-description pass downstream — OCR
+            # captures literal text; vision LLM captures the visual
+            # content. Together they give a faithful representation of
+            # PDFs that mix text and images.
            if hasattr(pipeline_options, "do_ocr"):
-                pipeline_options.do_ocr = False
-                logger.info("⚠️ OCR disabled by user request")
+                pipeline_options.do_ocr = True
+                logger.info("✅ OCR enabled for embedded text-in-image extraction")
            else:
                logger.warning("⚠️ OCR attribute not available in this Docling version")

--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
    """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    from app.etl_pipeline.file_classifier import (
-        FileCategory,
-        classify_file as etl_classify,
-    )

    await _notify(ctx, "parsing", "Processing file")
    await ctx.task_logger.log_task_progress(
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
        {"processing_stage": "extracting"},
    )

+    # Fetch the vision LLM whenever the operator opts in. The ETL
+    # pipeline decides what to do with it: image files run through the
+    # vision LLM directly; document files (PDFs) get per-image
+    # descriptions appended via picture_describer.
    vision_llm = None
-    if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
+    if ctx.use_vision_llm:
        from app.services.llm_service import get_vision_llm

        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:

    await _notify(ctx, "parsing", "Extracting content")

-    etl_result = await EtlPipelineService().extract(
+    # Document files (PDF, docx, etc.) get vision LLM treatment too:
+    # the ETL pipeline appends a per-image description section when
+    # vision_llm is provided. See picture_describer.describe_pictures.
+    vision_llm = None
+    if ctx.use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
+
+    etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(
            file_path=ctx.file_path,
            filename=ctx.filename,
@ -418,8 +427,12 @@ async def _extract_file_content(
        billable_pages = estimated_pages * mode.page_multiplier
        await page_limit_service.check_page_limit(user_id, billable_pages)

+    # Vision LLM is provided to the ETL pipeline for any file category
+    # when the operator opts in. Image files run through it directly;
+    # document files (PDFs) get per-image descriptions appended via
+    # picture_describer.
    vision_llm = None
-    if use_vision_llm and category == FileCategory.IMAGE:
+    if use_vision_llm:
        from app.services.llm_service import get_vision_llm

        vision_llm = await get_vision_llm(session, search_space_id)
--- a/surfsense_backend/scripts/docker/entrypoint.e2e.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.e2e.sh
@ -0,0 +1,53 @@
+#!/bin/bash
+# =============================================================================
+# E2E entrypoint for the multi-stage Dockerfile's `e2e` target.
+#
+# Dispatches on SERVICE_ROLE to the test-only entrypoints under tests/e2e/.
+# Those scripts apply sys.modules hijacks and LLM/embedding patches BEFORE
+# importing production app code (see tests/e2e/run_backend.py for rationale).
+#
+# Production never sees this file: tests/ is excluded from the production
+# stage, and the production stage uses scripts/docker/entrypoint.sh.
+# =============================================================================
+set -euo pipefail
+
+SERVICE_ROLE="${SERVICE_ROLE:-api}"
+echo "[e2e-entrypoint] starting role=${SERVICE_ROLE}"
+
+wait_for_db() {
+    # Block until the database is reachable. We don't loop forever — Compose
+    # depends_on/healthchecks already gate on db readiness, this is just
+    # belt-and-suspenders so a slow first connection doesn't race migrations.
+    for i in {1..60}; do
+        echo "[e2e-entrypoint] db check attempt ${i}/60"
+        if python -c "from app.db import engine; import asyncio; asyncio.run(engine.dispose())"; then
+            echo "[e2e-entrypoint] db reachable after ${i} attempts"
+            return 0
+        fi
+        sleep 1
+    done
+    echo "[e2e-entrypoint] ERROR: db not reachable after 60s" >&2
+    return 1
+}
+
+case "${SERVICE_ROLE}" in
+    api)
+        wait_for_db
+        echo "[e2e-entrypoint] running alembic upgrade head"
+        alembic upgrade head
+        # `exec` so SIGTERM from `docker stop` reaches Python directly,
+        # without a shell wrapper interposing.
+        exec python tests/e2e/run_backend.py
+        ;;
+    worker)
+        # Worker doesn't run migrations — the api role does that exactly once.
+        # We still wait for db so Celery's broker connection check doesn't
+        # race against an unready Postgres on cold start.
+        wait_for_db
+        exec python tests/e2e/run_celery.py
+        ;;
+    *)
+        echo "[e2e-entrypoint] ERROR: unknown SERVICE_ROLE='${SERVICE_ROLE}' (expected: api | worker)" >&2
+        exit 1
+        ;;
+esac
--- a/surfsense_backend/tests/e2e/README.md
+++ b/surfsense_backend/tests/e2e/README.md
@ -1,48 +1,48 @@
-# Backend E2E Test Harness
+# Backend E2E Harness

-Strict fakes + alternative entrypoints used **only** by Playwright E2E.
-Excluded from the production Docker image via `.dockerignore`.
+This directory contains the test-only backend entrypoints and fakes used by
+Playwright. They are not part of the production image: `.dockerignore` excludes
+`tests/`, and the E2E Docker stage copies this directory through a separate
+build context.

 ## Files

-| Path                             | Role                                                                            |
-| -------------------------------- | ------------------------------------------------------------------------------- |
-| `run_backend.py`                 | FastAPI entrypoint that hijacks `sys.modules` before importing `app.app:app`    |
-| `run_celery.py`                  | Celery worker entrypoint with the same hijack + patch logic                     |
-| `middleware/scenario.py`         | `X-E2E-Scenario` header → ContextVar (read by fakes)                            |
-| `fakes/composio_module.py`       | Strict drop-in for the `composio` package; raises on unknown surface            |
-| `fakes/llm.py`                   | `fake_get_user_long_context_llm` returning a `FakeListChatModel`                |
-| `fakes/embeddings.py`            | Deterministic 0.1-vector `embed_text` / `embed_texts`                           |
-| `fakes/fixtures/drive_files.json`| Canned Drive listings + file contents (incl. canary tokens)                     |
+| Path | Purpose |
+| --- | --- |
+| `run_backend.py` | Starts FastAPI after installing the test fakes into `sys.modules`. |
+| `run_celery.py` | Starts the Celery worker with the same fake setup. |
+| `middleware/scenario.py` | Reads `X-E2E-Scenario` into a request-scoped context var. |
+| `fakes/composio_module.py` | Fake `composio` package used by connector flows. |
+| `fakes/llm.py` | Fake chat model factory. |
+| `fakes/embeddings.py` | Deterministic embedding helpers. |
+| `fakes/fixtures/drive_files.json` | Drive fixture data and canary file contents. |

-## Why a sys.modules hijack?
+## Why the import hook exists

-Production code does `from composio import Composio` at module load
-time. By the time the FastAPI app object exists, that binding has
-already been resolved. The hijack runs **before** any `app.*` import,
-so the binding resolves to our strict fake. No production source
-changes; fakes are physically excluded from production images.
+Some production modules import SDK clients at module load time, for example
+`from composio import Composio`. By the time `app.app` has been imported, those
+bindings are already fixed.

-Belt + suspenders + no internet: the strict `__getattr__` in every
-fake raises `NotImplementedError` if a future production code path
-introduces a new SDK call. CI also sets `HTTPS_PROXY=http://127.0.0.1:1`
-plus sentinel API keys so any leaked outbound HTTP fails immediately.
+The E2E entrypoints install fake modules in `sys.modules` before importing any
+`app.*` module. That lets the normal production code run while SDK calls resolve
+to local fakes.

-## Adding a new fake
+The fakes should fail loudly. If production starts using a new SDK method that
+the fake does not implement, add that method to the fake instead of letting the
+test call the real service.

-1. Create `fakes/<sdk>_module.py` modelled on `composio_module.py`.
-2. In `run_backend.py` and `run_celery.py`, register
-   `sys.modules["<sdk>"] = _fake_<sdk>` before the `from app.app import app`
-   line.
-3. If the new fake needs scenario branching, read from
+## Adding a fake
+
+1. Add `fakes/<sdk>_module.py`.
+2. Register it in both `run_backend.py` and `run_celery.py` before importing
+   `app.app` or `app.celery_app`.
+3. If the fake needs per-test behavior, read the current scenario from
   `tests.e2e.middleware.scenario.current_scenario()`.

-## Reused by backend integration tests
+## Shared with backend integration tests

-The strict fakes are not only for Playwright. Backend route integration
-tests can import the same fake before importing `app.app`, so Composio
-route tests exercise production route code without touching the real
-SDK:
+Backend integration tests can use the same fakes when they need production route
+code without the real SDK:

 ```python
 from tests.e2e.fakes import composio_module as _fake_composio
@ -50,20 +50,93 @@ sys.modules["composio"] = _fake_composio
 from app.app import app
 ```

-See `surfsense_backend/tests/integration/composio/conftest.py` for the
-current pattern.
+See `surfsense_backend/tests/integration/composio/conftest.py` for the current
+pattern.

 ## Running locally

+The recommended local flow runs only Postgres and Redis in Docker, and the
+backend + Celery worker on the host. No `.env` file is required: both
+entrypoints `setdefault` every variable they need (DB URL, Redis URL,
+sentinel API keys, etc.) to values that match `docker-compose.deps-only.yml`.
+
+### One-time setup
+
+From `surfsense_web/`:
+
 ```bash
-cd surfsense_backend
+pnpm install
+pnpm exec playwright install --with-deps chromium
+```
+
+### Each run
+
+**1. Bring up Postgres + Redis** from the repo root (the other deps-only
+services (SearXNG, Zero, pgAdmin) are not needed for E2E):
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml up -d db redis
+```
+
+**2. Start the backend** in `surfsense_backend/`, terminal A:
+
+```bash
+uv sync
+uv run alembic upgrade head
 uv run python tests/e2e/run_backend.py
-# in a second shell:
+```
+
+**3. Start the Celery worker** in `surfsense_backend/`, terminal B:
+
+```bash
 uv run python tests/e2e/run_celery.py
 ```

-Then in `surfsense_web`:
+**4. Register the Playwright user**:

 ```bash
-pnpm test:e2e
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"email":"e2e-test@surfsense.net","password":"E2eTestPassword123!"}'
 ```
+
+**5. Run Playwright** from `surfsense_web/`, terminal C:
+
+```bash
+pnpm test:e2e             # dev server (fast iteration)
+pnpm test:e2e:headed      # show the browser
+pnpm test:e2e:ui          # Playwright UI mode
+pnpm test:e2e:prod        # build + start (matches CI exactly)
+```
+
+`playwright.config.ts` and the run scripts share defaults, so this works on a
+fresh checkout. Set `PLAYWRIGHT_TEST_EMAIL`, `PLAYWRIGHT_TEST_PASSWORD`,
+`NEXT_PUBLIC_FASTAPI_BACKEND_URL`, or any backend env (e.g. `DATABASE_URL`)
+only when pointing tests at a different stack.
+
+### Cleanup
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml down
+```
+
+Add `-v` to also wipe the Postgres volume.
+
+### Hermetic alternative (matches CI)
+
+To reproduce the CI environment exactly — backend and Celery in containers,
+network egress denied at L3 — replace steps 1–3 with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+```
+
+Then run steps 4 (curl register) and 5 (`pnpm test:e2e:prod`) as above. Tear
+down with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
+```
+
+This builds the ~9 GB `surfsense-e2e-backend:local` image, so the deps-only
+flow above is faster for day-to-day development.
--- a/surfsense_backend/tests/e2e/auth_mint.py
+++ b/surfsense_backend/tests/e2e/auth_mint.py
@ -0,0 +1,66 @@
+"""Test-only token mint endpoint for the E2E backend entrypoint.
+
+Mounted by ``tests/e2e/run_backend.py`` so Playwright can authenticate
+the seeded e2e user without hitting ``/auth/jwt/login`` (rate-limited
+to 5/min/IP in production). NEVER ships to production: this whole
+``tests/`` tree is excluded from the production Docker image by
+``surfsense_backend/.dockerignore``.
+
+Authn: shared secret in ``X-E2E-Mint-Secret``. Same value is set on the
+backend container env (``docker/docker-compose.e2e.yml``) and exported
+to the Playwright runner (``.github/workflows/e2e-tests.yml``).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from fastapi import APIRouter, FastAPI, Header, HTTPException
+from pydantic import BaseModel
+from sqlalchemy import select
+
+from app.db import User, async_session_maker
+from app.users import get_jwt_strategy
+
+_logger = logging.getLogger("surfsense.e2e.auth_mint")
+
+
+class MintRequest(BaseModel):
+    email: str = "e2e-test@surfsense.net"
+
+
+class MintResponse(BaseModel):
+    access_token: str
+    token_type: str = "bearer"
+
+
+def _expected_secret() -> str:
+    return os.environ.get("E2E_MINT_SECRET", "local-e2e-mint-secret-not-for-production")
+
+
+router = APIRouter(prefix="/__e2e__", tags=["__e2e__"])
+
+
+@router.post("/auth/token", response_model=MintResponse)
+async def mint_test_token(
+    body: MintRequest,
+    x_e2e_mint_secret: str = Header(..., alias="X-E2E-Mint-Secret"),
+) -> MintResponse:
+    if x_e2e_mint_secret != _expected_secret():
+        raise HTTPException(status_code=403, detail="invalid e2e mint secret")
+    async with async_session_maker() as session:
+        result = await session.execute(select(User).where(User.email == body.email))
+        user = result.scalar_one_or_none()
+    if user is None:
+        raise HTTPException(
+            status_code=404, detail=f"e2e user {body.email!r} not seeded"
+        )
+    token = await get_jwt_strategy().write_token(user)
+    return MintResponse(access_token=token)
+
+
+def install(app: FastAPI) -> None:
+    """Mount the test-only mint router onto the given FastAPI app."""
+    app.include_router(router)
+    _logger.warning("[e2e] mounted POST /__e2e__/auth/token (test-only token mint)")
--- a/surfsense_backend/tests/e2e/fakes/docling_service.py
+++ b/surfsense_backend/tests/e2e/fakes/docling_service.py
@ -0,0 +1,141 @@
+"""Stub DoclingService.process_document for E2E.
+
+The real ``DoclingService.process_document`` calls
+``DocumentConverter.convert(file_path)`` which lazily downloads the
+``docling-project/docling-layout-heron`` model from Hugging Face Hub.
+The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
+``docker/docker-compose.e2e.yml``), so that download fails with
+``LocalEntryNotFoundError`` and the indexing Celery task retries until
+the Playwright test hits its ~4-minute step timeout. In CI that is the
+difference between the suite finishing and the 30-minute job timeout
+killing the run before any report can upload.
+
+Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
+entirely. ``DoclingService.__init__`` is intentionally left untouched
+because constructing ``DocumentConverter(...)`` is cheap and offline —
+it is only ``.convert()`` that triggers the offline-model download.
+
+Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
+by ``generate_canary_pdfs.py`` and embeds its canary token as plain
+``(text) Tj`` PDF text operators. Extracting those operators gives us
+the canary string back, which is what the Playwright assertions look
+for in the resulting Document row.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Matches the `(escaped text) Tj` text-show operator emitted by
+# generate_canary_pdfs.py. Inside the parens, the escape rules are:
+#   \\  -> backslash
+#   \(  -> literal (
+#   \)  -> literal )
+# The character class [^\\()] consumes any non-escape byte; \\. consumes
+# an escape sequence. Sufficient for our synthetic fixtures.
+_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
+
+
+def _extract_text_from_synthetic_pdf(file_path: str) -> str:
+    """Pull every ``(text) Tj`` payload out of a fixture PDF in order.
+
+    Returns an empty string if the file cannot be read. We do not try to
+    handle arbitrary PDFs because the fake is only ever invoked against
+    fixtures we generate ourselves.
+    """
+    try:
+        data = Path(file_path).read_bytes()
+    except OSError as exc:
+        logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
+        return ""
+
+    lines: list[str] = []
+    for match in _TJ_PATTERN.finditer(data):
+        raw = match.group(1)
+        # Order-sensitive unescape via sentinel: protect `\\` first so
+        # the subsequent `\(` / `\)` passes do not corrupt it.
+        text = (
+            raw.replace(rb"\\", b"\x00")
+            .replace(rb"\(", b"(")
+            .replace(rb"\)", b")")
+            .replace(b"\x00", b"\\")
+        )
+        try:
+            lines.append(text.decode("utf-8"))
+        except UnicodeDecodeError:
+            lines.append(text.decode("latin-1"))
+    return "\n".join(lines)
+
+
+async def fake_process_document(
+    self,
+    file_path: str,
+    filename: str | None = None,
+) -> dict[str, Any]:
+    """Drop-in replacement for ``DoclingService.process_document``.
+
+    Returns the same dict shape as the production method so callers
+    (``app/etl_pipeline/parsers/docling.py``) can keep reading
+    ``result["content"]`` without changes.
+    """
+    extracted = _extract_text_from_synthetic_pdf(file_path)
+    display_name = filename or Path(file_path).name
+
+    if extracted:
+        content = f"# {display_name}\n\n{extracted}\n"
+    else:
+        # Empty fallback so the indexing pipeline does not error out on
+        # an unexpected payload. A failing canary assertion is a much
+        # clearer failure mode than a hard parser exception.
+        content = (
+            f"# {display_name}\n\n(empty docling fake — no text-show operators found)\n"
+        )
+
+    logger.info(
+        "[fake-docling] returning %d chars for %s",
+        len(content),
+        display_name,
+    )
+
+    return {
+        "content": content,
+        "full_text": content,
+        "service_used": "docling-fake",
+        "status": "success",
+        "processing_notes": "e2e fake DoclingService — no real PDF parsing",
+    }
+
+
+def install(patches: list[Any]) -> None:
+    """Patch ``DoclingService.process_document`` at the class level.
+
+    Patching the class method (rather than each call site) is correct
+    here because every consumer goes through
+    ``create_docling_service()`` → ``DoclingService()`` → instance method
+    dispatch, so the descriptor protocol picks up our replacement. There
+    is exactly one such consumer today
+    (``app/etl_pipeline/parsers/docling.py``), but patching the class is
+    future-proof.
+
+    Fails loud rather than warning, because a silent passthrough means
+    real Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.
+    """
+    from unittest.mock import patch as _patch
+
+    target = "app.services.docling_service.DoclingService.process_document"
+    try:
+        p = _patch(target, fake_process_document)
+        p.start()
+        patches.append(p)
+        logger.info("[fake-docling] patched %s", target)
+    except (ModuleNotFoundError, AttributeError) as exc:
+        raise RuntimeError(
+            f"Could not patch Docling binding {target!r}: {exc!s}. "
+            f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
+            f"to point at the new binding site."
+        ) from exc
--- a/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
+++ b/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
@ -0,0 +1,71 @@
+# Synthetic Global LLM configuration for E2E ONLY.
+#
+# Why this file exists:
+#   surfsense_backend/app/config/global_llm_config.yaml is gitignored
+#   (operators ship real API keys there). In CI that file does not exist,
+#   so app.config.load_global_llm_configs() returns [], every chat-stream
+#   test fails fast with "No usable global LLM configs are available for
+#   Auto mode" raised by auto_model_pin_service._global_candidates().
+#
+# What this file does:
+#   tests/e2e/run_backend.py and tests/e2e/run_celery.py copy this file
+#   to app/config/global_llm_config.yaml at startup, BEFORE app.config
+#   is imported. The copy lives only inside the E2E Docker container.
+#
+# Why a fake api_key is safe:
+#   tests.e2e.fakes.chat_llm patches
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_config
+#   so the resolved auto-pin id is never sent to a real LLM provider.
+#   The values below only need to pass
+#   auto_model_pin_service._is_usable_global_config()
+#   which requires id / model_name / provider / api_key all truthy.
+#
+# Why TWO entries (premium + free):
+#   auto_model_pin_service.resolve_or_get_pinned_llm_config_id() splits
+#   candidates by billing_tier based on _is_premium_eligible(user):
+#     premium_eligible == True  -> keeps only tier=="premium" configs
+#     premium_eligible == False -> keeps only tier!="premium" configs
+#   A single-tier fixture would fail one of the two branches with
+#   "Auto mode could not find an eligible LLM config for this user and
+#   quota state". Shipping one of each guarantees every quota state
+#   resolves to a viable pin in E2E.
+
+router_settings:
+  routing_strategy: "simple-shuffle"
+  num_retries: 0
+  allowed_fails: 1
+  cooldown_time: 1
+
+global_llm_configs:
+  - id: -9001
+    name: "E2E Fake Auto Model (premium)"
+    billing_tier: "premium"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-premium"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-premium"
+
+  - id: -9002
+    name: "E2E Fake Auto Model (free)"
+    billing_tier: "free"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-free"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-free"
--- a/surfsense_backend/tests/e2e/run_backend.py
+++ b/surfsense_backend/tests/e2e/run_backend.py
@ -23,15 +23,12 @@ Usage:

 from __future__ import annotations

+import asyncio
 import logging
 import os
 import sys

-# ---------------------------------------------------------------------------
-# 1) Hijack sys.modules BEFORE any production import.
-#    Production: composio_service.py:11 does `from composio import Composio`.
-#    With this hijack in place, that import resolves to our strict fake.
-# ---------------------------------------------------------------------------
+import uvicorn

 # Make the surfsense_backend root importable as a top-level package so
 # `import tests.e2e.fakes...` works regardless of how the entrypoint is
@ -42,97 +39,175 @@ _BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
 if _BACKEND_ROOT not in sys.path:
    sys.path.insert(0, _BACKEND_ROOT)

-import tests.e2e.fakes.composio_module as _fake_composio  # noqa: E402
-import tests.e2e.fakes.notion_module as _fake_notion  # noqa: E402

-sys.modules["composio"] = _fake_composio
-sys.modules["notion_client"] = _fake_notion
-sys.modules["notion_client.errors"] = _fake_notion.errors
-
-
-# ---------------------------------------------------------------------------
-# 2) Standard logging + dotenv so the rest of the app behaves like main.py.
-# ---------------------------------------------------------------------------
-
-from dotenv import load_dotenv  # noqa: E402
-
-load_dotenv()
-os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
-os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
-os.environ.setdefault(
-    "CONFLUENCE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/confluence/connector/callback",
-)
-os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
-os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
-os.environ.setdefault(
-    "NOTION_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/notion/connector/callback",
-)
-os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
-os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
-os.environ.setdefault(
-    "ONEDRIVE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
-)
-os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
-os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
-os.environ.setdefault(
-    "DROPBOX_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
-)
-os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
-os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
 logger = logging.getLogger("surfsense.e2e.backend")
-logger.warning(
-    "*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
-)
-
-
-# ---------------------------------------------------------------------------
-# 3) Now import the production app. Every module in app.* loads here,
-#    creating their bindings (some of which we will patch in step 4).
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# 4) Patch LLM + embedding bindings at every consumer site.
-#    Composio is already covered by the sys.modules hijack in step 1.
-# ---------------------------------------------------------------------------
-from unittest.mock import patch  # noqa: E402
-
-from app.app import app  # noqa: E402
-from tests.e2e.fakes import (  # noqa: E402
-    clickup_module as _fake_clickup_module,
-    confluence_indexer as _fake_confluence_indexer,
-    confluence_oauth as _fake_confluence_oauth,
-    dropbox_api as _fake_dropbox_api,
-    embeddings as _fake_embeddings,
-    jira_module as _fake_jira_module,
-    linear_module as _fake_linear_module,
-    mcp_oauth_runtime as _fake_mcp_oauth_runtime,
-    mcp_runtime as _fake_mcp_runtime,
-    native_google as _fake_native_google,
-    notion_module as _fake_notion_module,
-    onedrive_graph as _fake_onedrive_graph,
-    slack_module as _fake_slack_module,
-)
-from tests.e2e.fakes.chat_llm import (  # noqa: E402
-    fake_create_chat_litellm_from_agent_config,
-    fake_create_chat_litellm_from_config,
-)
-from tests.e2e.fakes.llm import fake_get_user_long_context_llm  # noqa: E402

+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
 _active_patches: list = []


+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_production_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_production_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google
+    # raises "Fake Google Flow requires redirect_uri." if these are empty,
+    # so connector/add routes return 500 in CI where no .env supplies them.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, ``auto_model_pin_service`` raises
+    ``"No usable global LLM configs are available for Auto mode"`` on
+    every chat-stream request.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE ``_import_production_app()`` so
+    ``app.config`` picks it up on import. Production code is untouched —
+    this is purely a test-time scaffold.
+
+    Only installs when the destination is missing. A developer running
+    the E2E entrypoint locally keeps their real ``global_llm_config.yaml``
+    intact (the patched ``create_chat_litellm_from_*`` factories make the
+    actual model values irrelevant either way).
+
+    MUST run before _import_production_app().
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"This file is checked into tests/e2e/fixtures/ — if it has gone "
+            f"missing, restore it from VCS before running the E2E entrypoint."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    shutil.copyfile(src, dst)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_production_app():
+    """Import and return the production FastAPI app.
+
+    Every module under ``app.*`` loads here, creating their bindings.
+    The LLM/embedding factories captured at this point will be replaced
+    by patches in _patch_llm_bindings() below.
+    """
+    from app.app import app as production_app
+
+    return production_app
+
+
 def _patch_llm_bindings() -> None:
    """Replace LLM factories at every known binding site."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
    targets = [
        "app.services.llm_service.get_user_long_context_llm",
        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -190,38 +265,90 @@ def _patch_llm_bindings() -> None:
            logger.warning("[fake-chat-llm] could not patch %s: %s.", target, exc)


-_patch_llm_bindings()
-_fake_embeddings.install(_active_patches)
-_fake_confluence_oauth.install(_active_patches)
-_fake_confluence_indexer.install(_active_patches)
-_fake_native_google.install(_active_patches)
-_fake_onedrive_graph.install(_active_patches)
-_fake_dropbox_api.install(_active_patches)
-_fake_notion_module.install(_active_patches)
-_fake_linear_module.install(_active_patches)
-_fake_jira_module.install(_active_patches)
-_fake_clickup_module.install(_active_patches)
-_fake_mcp_runtime.install(_active_patches)
-_fake_mcp_oauth_runtime.install(_active_patches)
-_fake_slack_module.install(_active_patches)
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)


-# ---------------------------------------------------------------------------
-# 5) Mount test-only middleware. Production never reaches this code.
-# ---------------------------------------------------------------------------
+def _install_test_only_app_extensions(app) -> None:
+    """Mount test-only middleware + the /__e2e__ token mint router.

-from tests.e2e.middleware.scenario import ScenarioMiddleware  # noqa: E402
+    POST /__e2e__/auth/token bypasses /auth/jwt/login's 5/min/IP rate
+    limit so Playwright workers can authenticate without thrashing the
+    production auth surface. See tests/e2e/auth_mint.py.
+    """
+    from tests.e2e.auth_mint import install as install_e2e_mint
+    from tests.e2e.middleware.scenario import ScenarioMiddleware

-app.add_middleware(ScenarioMiddleware)
+    app.add_middleware(ScenarioMiddleware)
+    install_e2e_mint(app)


-# ---------------------------------------------------------------------------
-# 6) Start uvicorn, mirroring main.py's behaviour.
-# ---------------------------------------------------------------------------
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production FastAPI app.

-import asyncio  # noqa: E402
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so Auto-mode
+         pin resolution finds at least one usable candidate.
+      5) Import production app (which transitively imports the now-faked
+         external SDKs and reads the env defaults + YAML).
+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Mount test-only middleware + /__e2e__ routes onto the app.
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()

-import uvicorn  # noqa: E402
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    production_app = _import_production_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    _install_test_only_app_extensions(production_app)
+    return production_app
+
+
+app = _bootstrap()


 def _main() -> None:
--- a/surfsense_backend/tests/e2e/run_celery.py
+++ b/surfsense_backend/tests/e2e/run_celery.py
@ -25,96 +25,166 @@ if _BACKEND_ROOT not in sys.path:
    sys.path.insert(0, _BACKEND_ROOT)


-# ---------------------------------------------------------------------------
-# 1) Hijack sys.modules BEFORE production celery imports anything.
-# ---------------------------------------------------------------------------
-
-import tests.e2e.fakes.composio_module as _fake_composio  # noqa: E402
-import tests.e2e.fakes.notion_module as _fake_notion  # noqa: E402
-
-sys.modules["composio"] = _fake_composio
-sys.modules["notion_client"] = _fake_notion
-sys.modules["notion_client.errors"] = _fake_notion.errors
-
-
-# ---------------------------------------------------------------------------
-# 2) Logging + dotenv.
-# ---------------------------------------------------------------------------
-
-from dotenv import load_dotenv  # noqa: E402
-
-load_dotenv()
-os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
-os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
-os.environ.setdefault(
-    "CONFLUENCE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/confluence/connector/callback",
-)
-os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
-os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
-os.environ.setdefault(
-    "NOTION_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/notion/connector/callback",
-)
-os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
-os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
-os.environ.setdefault(
-    "ONEDRIVE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
-)
-os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
-os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
-os.environ.setdefault(
-    "DROPBOX_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
-)
-os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
-os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
 logger = logging.getLogger("surfsense.e2e.celery")
-logger.warning("*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***")
-
-
-# ---------------------------------------------------------------------------
-# 3) Import the production celery_app. All task modules load here.
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# 4) Patch LLM + embedding bindings inside the worker process.
-# ---------------------------------------------------------------------------
-from unittest.mock import patch  # noqa: E402
-
-from app.celery_app import celery_app  # noqa: E402
-from tests.e2e.fakes import (  # noqa: E402
-    clickup_module as _fake_clickup_module,
-    confluence_indexer as _fake_confluence_indexer,
-    confluence_oauth as _fake_confluence_oauth,
-    dropbox_api as _fake_dropbox_api,
-    embeddings as _fake_embeddings,
-    jira_module as _fake_jira_module,
-    linear_module as _fake_linear_module,
-    mcp_oauth_runtime as _fake_mcp_oauth_runtime,
-    mcp_runtime as _fake_mcp_runtime,
-    native_google as _fake_native_google,
-    notion_module as _fake_notion_module,
-    onedrive_graph as _fake_onedrive_graph,
-    slack_module as _fake_slack_module,
-)
-from tests.e2e.fakes.chat_llm import (  # noqa: E402
-    fake_create_chat_litellm_from_agent_config,
-    fake_create_chat_litellm_from_config,
-)
-from tests.e2e.fakes.llm import fake_get_user_long_context_llm  # noqa: E402

+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
 _active_patches: list = []


+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_celery_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_celery_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google raises
+    # "Fake Google Flow requires redirect_uri." when these are empty.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, the worker's view of the config diverges
+    from the API container.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE _import_celery_app() so
+    ``app.config`` picks it up on import. Install-only-if-missing so a
+    developer's local config (with real API keys) is preserved.
+
+    MUST run before _import_celery_app().
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"Restore tests/e2e/fixtures/global_llm_config.yaml from VCS."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    shutil.copyfile(src, dst)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_celery_app():
+    """Import and return the production Celery app.
+
+    Every module under ``app.*`` (including all task modules) loads here,
+    creating their bindings. The LLM/embedding factories captured at this
+    point will be replaced by patches in _patch_llm_bindings() below.
+    """
+    from app.celery_app import celery_app
+
+    return celery_app
+
+
 def _patch_llm_bindings() -> None:
+    """Replace LLM factories at every known binding site in worker tasks."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
    targets = [
        "app.services.llm_service.get_user_long_context_llm",
        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -172,38 +242,93 @@ def _patch_llm_bindings() -> None:
            )


-_patch_llm_bindings()
-_fake_embeddings.install(_active_patches)
-_fake_confluence_oauth.install(_active_patches)
-_fake_confluence_indexer.install(_active_patches)
-_fake_native_google.install(_active_patches)
-_fake_onedrive_graph.install(_active_patches)
-_fake_dropbox_api.install(_active_patches)
-_fake_notion_module.install(_active_patches)
-_fake_linear_module.install(_active_patches)
-_fake_jira_module.install(_active_patches)
-_fake_clickup_module.install(_active_patches)
-_fake_mcp_runtime.install(_active_patches)
-_fake_mcp_oauth_runtime.install(_active_patches)
-_fake_slack_module.install(_active_patches)
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)


-# ---------------------------------------------------------------------------
-# 5) Start the worker.
-# ---------------------------------------------------------------------------
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production Celery app.
+
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so the worker's
+         view of GLOBAL_LLM_CONFIGS matches the API container.
+      5) Import production celery_app (which transitively imports the
+         now-faked external SDKs and reads the env defaults + YAML).
+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Install runtime fakes for connectors and chat backends.
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    celery_app = _import_celery_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    return celery_app
+
+
+celery_app = _bootstrap()


 def _main() -> None:
-    # Default queues mirror production (default queue + connectors queue
-    # so Drive indexing tasks are picked up).
    queue_name = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
    queues = f"{queue_name},{queue_name}.connectors"
+
+    # macOS forks-after-MPS-init crash prefork workers; threads avoid it.
+    default_pool = "threads" if sys.platform == "darwin" else "prefork"
+    pool = os.getenv("CELERY_POOL", default_pool)
+    concurrency = os.getenv("CELERY_CONCURRENCY", "2")
+
    celery_app.worker_main(
        argv=[
            "worker",
            "--loglevel=info",
            f"--queues={queues}",
-            "--concurrency=2",
+            f"--pool={pool}",
+            f"--concurrency={concurrency}",
            "--without-gossip",
            "--without-mingle",
        ]
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
    assert result.content_type == "document"


+# ---------------------------------------------------------------------------
+# Document path with vision LLM: per-image descriptions are appended
+# ---------------------------------------------------------------------------
+
+
+def _fake_extraction_result(*descriptions):
+    from app.etl_pipeline.picture_describer import (
+        PictureDescription,
+        PictureExtractionResult,
+    )
+
+    return PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=d["page"],
+                ordinal_in_page=d.get("ordinal", 0),
+                name=d["name"],
+                sha256=d.get("sha", "deadbeef"),
+                description=d["desc"],
+            )
+            for d in descriptions
+        ]
+    )
+
+
+async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
+    """A PDF with an `<!-- image -->` placeholder + caption gets the
+    block spliced inline (no orphaned ``## Image Content`` section).
+
+    This is the headline scenario for the medxpertqa benchmark: the
+    image content lives in the same chunk as the surrounding case text
+    so retrieval pulls the question, image, and answer options together.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": (
+            "# MedXpertQA-MM MM-130\n\n"
+            "## Clinical case\n\nA 44-year-old man...\n\n"
+            "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+            "## Answer choices\n\nA) ...\n"
+        )
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {
+            "page": 1,
+            "name": "Im0",
+            "desc": "Axial CT showing a large cystic mass.",
+        }
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    # The placeholder + caption are gone, replaced by a horizontal-
+    # rule-delimited section with the captioned filename.
+    assert "<!-- image -->" not in md
+    assert "Image: MM-130-a.jpeg" not in md
+    assert "**Embedded image:** `MM-130-a.jpeg`" in md
+    assert "**Visual description:**" in md
+    assert "Axial CT showing a large cystic mass." in md
+    # No OCR section -- our fake_extraction_result has no ocr_text,
+    # and the format omits the section when there's no text to show.
+    assert "**OCR text:**" not in md
+    # No raw HTML / XML tags or blockquote wrapping leak.
+    assert "<image" not in md
+    assert "> **Embedded image:**" not in md
+    # No appended section -- everything went inline.
+    assert "## Image Content" not in md
+    # Surrounding case text + answer options are preserved.
+    assert "A 44-year-old man..." in md
+    assert "## Answer choices" in md
+    assert "A) ..." in md
+
+
+async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
+    """When parser markdown has no image markers, descriptions get appended.
+
+    This is the fallback path for parsers that drop image placeholders
+    entirely. The image content still ends up in the markdown -- just
+    in a clearly-labeled section rather than inline.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {"page": 1, "name": "Im0", "desc": "An image description."}
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    assert "# Parsed PDF text" in md
+    assert "## Image Content (vision-LLM extracted)" in md
+    assert "**Embedded image:** `Im0`" in md
+    assert "An image description." in md
+
+
+async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
+    tmp_path, mocker
+):
+    """No vision LLM -> parser markdown returned as-is."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(),
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert "<image" not in result.markdown_content
+    describe_mock.assert_not_called()
+
+
+async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
+    tmp_path, mocker
+):
+    """A pypdf or vision LLM blow-up never fails the document upload."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(side_effect=RuntimeError("pypdf exploded")),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert result.etl_service == "DOCLING"
+
+
+async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
+    tmp_path, mocker
+):
+    """Vision-LLM-enabled PDF with zero extracted images is unchanged."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Just text, no images"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    empty = _fake_extraction_result()
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=empty),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Just text, no images"
+    assert "<image" not in result.markdown_content
+
+
+# ---------------------------------------------------------------------------
+# Per-image OCR runner: wiring + behaviour
+#
+# When extracting a PDF with a vision LLM, the ETL service must ALSO
+# pass an ``ocr_runner`` to picture_describer. The runner is a closure
+# that re-feeds each extracted image through a vision-LLM-less
+# EtlPipelineService -- i.e. the same OCR engine that handles
+# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
+# at each embedded image, with the text attached to the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
+    tmp_path, mocker
+):
+    """The ETL service must wire an ocr_runner kwarg to describe_pictures."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=_fake_extraction_result()),
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    describe_mock.assert_awaited_once()
+    _, kwargs = describe_mock.await_args
+    assert "ocr_runner" in kwargs
+    assert callable(kwargs["ocr_runner"])
+
+
+async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
+    tmp_path, mocker
+):
+    """The OCR runner closure should re-extract each image via the parser.
+
+    We capture the runner that the ETL service passes to
+    describe_pictures, invoke it with a fake image path, and assert
+    that Docling was called with that image. This proves the closure
+    is wired to a vision-LLM-less sub-pipeline (otherwise it would
+    recurse into the vision LLM and never hit the OCR engine).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    image_file = tmp_path / "Im0.png"
+    image_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": "Slice 24 / 60   L   R"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(image_file), "Im0.png")
+
+    assert ocr_text == "Slice 24 / 60   L   R"
+    # Docling was invoked twice in total: once for the PDF, once for
+    # the image we re-fed via the runner.
+    assert fake_docling.process_document.await_count == 2
+
+
+async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
+    tmp_path, mocker
+):
+    """Unsupported image format → runner returns empty string, doesn't raise.
+
+    Common case: a PDF embeds a JPEG2000 or CCITT-TIFF image that
+    Docling can't load. We don't want an unsupported format on ONE
+    embedded image to spoil the whole PDF extraction; the runner
+    should swallow the EtlUnsupportedFileError and return "" so the
+    image gets a description but no OCR tag.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    weird_image = tmp_path / "Im0.jp2"  # JPEG2000, unlikely to be supported
+    weird_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(weird_image), "Im0.jp2")
+
+    assert ocr_text == ""
+
+
 # ---------------------------------------------------------------------------
 # Processing Mode enum tests
 # ---------------------------------------------------------------------------
--- a/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
@ -0,0 +1,967 @@
+"""Unit tests for the picture_describer module.
+
+Covers:
+
+- :func:`describe_pictures` -- the PDF image walker + per-image vision
+  LLM call (structured output split into ``ocr_text`` and
+  ``description``);
+- :func:`inject_descriptions_inline` -- in-place replacement of image
+  placeholders / captions in the parser markdown;
+- :func:`merge_descriptions_into_markdown` -- the top-level helper
+  that inlines what it can and appends what it can't;
+- :func:`render_appended_section` -- the appended-fallback renderer.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.etl_pipeline.picture_describer import (
+    PictureDescription,
+    PictureExtractionResult,
+    describe_pictures,
+    inject_descriptions_inline,
+    merge_descriptions_into_markdown,
+    render_appended_section,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_image_obj(name: str, data: bytes):
+    """Mimic pypdf's ImageFile object shape for the bits we use."""
+    img = MagicMock()
+    img.name = name
+    img.data = data
+    return img
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: short-circuits
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
+    """Non-PDF files are silently no-op'd; we don't try to extract images."""
+    docx_file = tmp_path / "report.docx"
+    docx_file.write_bytes(b"PK fake docx")
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(docx_file), "report.docx", fake_llm)
+
+    assert result.descriptions == []
+    assert result.skipped_too_large == 0
+    fake_llm.ainvoke.assert_not_called()
+
+
+async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
+    """If the caller didn't provide a vision LLM, we no-op even for PDFs."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    result = await describe_pictures(str(pdf_file), "report.pdf", None)
+    assert result.descriptions == []
+
+
+async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
+    """A PDF that pypdf can open but contains zero images returns empty."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[]), MagicMock(images=[])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert result.descriptions == []
+    fake_llm.ainvoke.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: happy paths
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
+    """Every eligible image gets exactly one description-only vision call."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    page1 = MagicMock(images=[img_a])
+    page2 = MagicMock(images=[img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Description A", "Description B"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 2
+    by_name = {d.name: d.description for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "Description A", "Im1.png": "Description B"}
+    assert all(d.page_number in (1, 2) for d in result.descriptions)
+    assert parse_mock.await_count == 2
+
+
+async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
+    """An image that appears N times in the PDF is described once."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    payload = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
+    img = _make_image_obj("logo.png", payload)
+    page1 = MagicMock(images=[img])
+    page2 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+    page3 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2, page3]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Logo desc"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.skipped_duplicate == 2
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
+    """Sub-1KB images (tracking pixels, dots, etc.) are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    tiny = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
+    big = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)
+    page = MagicMock(images=[tiny, big])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="CT scan"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ct.jpeg"
+    assert result.skipped_too_small == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
+    """Images larger than the vision LLM's per-image cap are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    huge = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
+    ok = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    page = MagicMock(images=[huge, ok])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="OK image"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ok.jpeg"
+    assert result.skipped_too_large == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
+    """A vision LLM failure on one image must not kill the whole document."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
+    img_b = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)
+    page = MagicMock(images=[img_a, img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=[RuntimeError("vision blew up"), "Success"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "Success"
+    assert result.failed == 1
+
+
+async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
+    """A malformed PDF that pypdf can't open returns an empty result."""
+    pdf_file = tmp_path / "broken.pdf"
+    pdf_file.write_bytes(b"not a pdf")
+
+    mocker.patch("pypdf.PdfReader", side_effect=ValueError("EOF marker not found"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "broken.pdf", fake_llm)
+    assert result.descriptions == []
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: replacement patterns
+# ---------------------------------------------------------------------------
+
+
+def _desc(name="Im0", description="A CT scan."):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+    )
+
+
+def test_inject_no_op_when_no_descriptions():
+    markdown = "# Title\n\nbody text\n"
+    result = PictureExtractionResult()
+    out, n = inject_descriptions_inline(markdown, result)
+    assert out == markdown
+    assert n == 0
+
+
+def test_inject_replaces_placeholder_with_caption():
+    """`<!-- image -->` + `Image: <name>` together becomes one block.
+
+    This is the most common medxpertqa case: our renderer puts a caption
+    line right below the embedded JPEG, and Docling preserves both.
+    """
+    markdown = (
+        "# Case\n\n"
+        "Clinical text...\n\n"
+        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+        "Answer choices: A) ...\n"
+    )
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "<!-- image -->" not in out
+    assert "Image: MM-130-a.jpeg" not in out  # caption consumed
+    # New format: horizontal-rule-delimited section with "Embedded
+    # image:" anchor and named "Visual description:" section. No
+    # blockquote wrapping -- nested blocks (lists, code, tables) inside
+    # a blockquote are silently dropped by Streamdown / remark.
+    assert "**Embedded image:** `MM-130-a.jpeg`" in out
+    assert "**Visual description:**" in out
+    assert "A CT scan." in out
+    # Block is delimited by horizontal rules so it stands out from
+    # surrounding paragraphs.
+    assert "\n---\n" in out
+    # No OCR section -- this fixture has no ocr_text on its descriptions.
+    assert "**OCR text:**" not in out
+    # No raw HTML tags / blockquote prefixes leak.
+    assert "<image" not in out
+    assert "</image>" not in out
+    assert "> **Embedded image:**" not in out  # we no longer wrap in `>`
+    # Surrounding context is preserved.
+    assert "Clinical text..." in out
+    assert "Answer choices: A) ..." in out
+
+
+def test_inject_uses_pypdf_name_when_no_caption():
+    """`<!-- image -->` alone uses the pypdf-given name as the attribute."""
+    markdown = "# Case\n\n<!-- image -->\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_inject_replaces_bare_caption():
+    """A bare `Image: <name>` line (no placeholder) still gets replaced."""
+    markdown = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Image: scan.jpeg" not in out
+
+
+def test_inject_handles_multiple_images_in_order():
+    """Two placeholders + two descriptions: each consumed in document order."""
+    markdown = (
+        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
+        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
+                description="Desc A",
+            ),
+            PictureDescription(
+                page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
+                description="Desc B",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    assert "**Embedded image:** `a.jpeg`" in out
+    assert "**Embedded image:** `b.jpeg`" in out
+    assert out.index("a.jpeg") < out.index("b.jpeg")
+    assert "Desc A" in out and "Desc B" in out
+
+
+def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
+    """Three descriptions, one marker -> only one inlined, two leftover."""
+    markdown = "Just one <!-- image --> here.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+            _desc(name="Im2", description="Third"),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+    assert "**Embedded image:** `Im1`" not in out
+
+
+def test_inject_returns_zero_when_no_markers_present():
+    """Markdown with no image markers at all returns the input unchanged."""
+    markdown = "# Title\n\nJust text. No images mentioned at all.\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 0
+    assert out == markdown
+
+
+# ---------------------------------------------------------------------------
+# render_appended_section
+# ---------------------------------------------------------------------------
+
+
+def test_render_appended_empty_when_nothing_passed():
+    assert render_appended_section([]) == ""
+
+
+def test_render_appended_renders_each_image_as_block():
+    descriptions = [
+        _desc(name="MM-130-a.jpeg", description="CT scan"),
+        _desc(name="MM-130-b.jpeg", description="Bar chart"),
+    ]
+    rendered = render_appended_section(descriptions)
+    assert "## Image Content (vision-LLM extracted)" in rendered
+    assert "**Embedded image:** `MM-130-a.jpeg`" in rendered
+    assert "CT scan" in rendered
+    assert "**Embedded image:** `MM-130-b.jpeg`" in rendered
+    assert "Bar chart" in rendered
+    # Each image block is delimited by horizontal rules.
+    assert rendered.count("\n---\n") >= 2
+    # No raw HTML / XML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **Embedded image:**" not in rendered
+    assert "**OCR text:**" not in rendered
+
+
+def test_render_appended_includes_skip_notes():
+    descriptions = [_desc()]
+    skip_result = PictureExtractionResult(
+        descriptions=descriptions,
+        skipped_too_small=2,
+        skipped_too_large=1,
+        skipped_duplicate=3,
+        failed=1,
+    )
+    rendered = render_appended_section(descriptions, skip_notes=skip_result)
+    assert "_Note:" in rendered
+    assert "2 too small" in rendered
+    assert "1 too large" in rendered
+    assert "3 duplicate" in rendered
+    assert "1 failed" in rendered
+
+
+# ---------------------------------------------------------------------------
+# merge_descriptions_into_markdown: top-level
+# ---------------------------------------------------------------------------
+
+
+def test_merge_inlines_when_marker_present():
+    markdown = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `scan.jpeg`" in out
+    # Nothing leaked into an appended section -- we should NOT see the
+    # appended-section heading because everything went inline.
+    assert "## Image Content" not in out
+
+
+def test_merge_appends_when_no_marker_present():
+    """Zero markers means everything goes into an appended section."""
+    markdown = "Pure text doc, no image markers.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="An image desc.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "Pure text doc" in out
+    assert "## Image Content (vision-LLM extracted)" in out
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_merge_appends_leftovers_with_distinct_heading():
+    """One marker, two descriptions -> one inline, second appended under
+    a heading that signals it's a leftover.
+    """
+    markdown = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+        ]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `a.jpeg`" in out  # inlined
+    assert "## Image Content (additional, no inline marker found)" in out
+    assert "**Embedded image:** `Im1`" in out  # appended
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: ocr_runner integration
+#
+# These tests cover the per-image OCR side-channel: when the caller
+# supplies an ``ocr_runner`` callable, each extracted image is sent
+# both to the vision LLM (visual description) and to the OCR runner
+# (text-in-image), in parallel. The OCR text -- if any -- is recorded
+# on the PictureDescription and rendered in the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
+    """When an ocr_runner is provided, it's invoked once per eligible image."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img_a, img_b])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Visual A", "Visual B"]),
+    )
+    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert ocr_runner.await_count == 2
+    by_name = {d.name: d.ocr_text for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
+
+
+async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
+    tmp_path, mocker
+):
+    """Vision LLM and OCR run concurrently per image, not sequentially.
+
+    We verify this by recording call timestamps: if both finish within
+    a small window relative to the per-call sleep, they ran in parallel.
+    """
+    import asyncio
+    import time
+
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    sleep_each = 0.05  # 50ms per call
+
+    async def slow_vision(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "Visual"
+
+    async def slow_ocr(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "OCR"
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=slow_vision,
+    )
+
+    fake_llm = MagicMock()
+    started = time.perf_counter()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
+    )
+    elapsed = time.perf_counter() - started
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text == "OCR"
+    # Sequential would be ~2*sleep_each. Parallel is ~1*sleep_each + overhead.
+    # Be generous with the bound so we're not flaky on slow CI.
+    assert elapsed < 1.5 * sleep_each, (
+        f"vision+OCR appear to be sequential (took {elapsed:.3f}s)"
+    )
+
+
+async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
+    """Empty / whitespace-only OCR result is normalised to None.
+
+    This means the rendered image block won't carry an empty
+    "OCR text" section for images that contain no text at all
+    (e.g. a clean radiograph).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(return_value="   \n  \n")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
+    """An OCR runner exception must not kill the description for that image.
+
+    OCR is supplementary; the vision LLM's description is the primary
+    payload. If OCR blows up we drop the OCR field for that image and
+    keep the description.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(side_effect=RuntimeError("OCR backend down"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "A radiograph."
+    assert result.descriptions[0].ocr_text is None
+    assert result.failed == 0  # the IMAGE didn't fail; only its OCR did
+
+
+async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
+    tmp_path, mocker
+):
+    """If the vision LLM fails, the image is skipped even if OCR succeeded.
+
+    The inline block's primary purpose is the visual description; an
+    OCR-only block would be misleading (it'd look like the vision
+    pipeline ran when it didn't), so we treat vision failure as image
+    failure regardless of OCR outcome.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=RuntimeError("vision blew up")),
+    )
+    ocr_runner = AsyncMock(return_value="OCR text")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert result.descriptions == []
+    assert result.failed == 1
+
+
+async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
+    tmp_path, mocker
+):
+    """Backward compat: omitting ocr_runner produces description-only blocks."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Visual"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+# ---------------------------------------------------------------------------
+# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
+# ---------------------------------------------------------------------------
+
+
+def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L  R  10mm"):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+def test_inject_renders_ocr_section_when_ocr_text_present():
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L  R  10mm")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "**OCR text:**" in out
+    assert "L  R  10mm" in out
+    # OCR section comes before the visual description (literal text
+    # first, interpretation second).
+    assert out.index("**OCR text:**") < out.index("**Visual description:**")
+    # Critical: no nested-block constructs (fenced code, blockquote)
+    # that previous formats relied on -- both broke in Streamdown /
+    # PlateJS by escaping their container and dropping content.
+    assert "```" not in out
+    assert "> **" not in out
+
+
+def test_inject_renders_multiline_ocr_with_hard_breaks():
+    """Multi-line OCR uses trailing-two-spaces hard breaks so each
+    line renders on its own row, without needing a fragile fenced
+    code block or blockquote wrapper."""
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
+    )
+
+    out, _ = inject_descriptions_inline(markdown, result)
+
+    # Every OCR line is present.
+    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
+        assert line in out
+    # Non-last OCR lines get the trailing two-space hard break.
+    assert "Slice 24 / 60  \n" in out
+    assert "\nL  \n" in out
+    assert "\nR  \n" in out
+    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
+    assert "10 mm  \n" not in out
+    assert "10 mm\n" in out
+
+
+def test_render_appended_renders_ocr_section_when_ocr_text_present():
+    descriptions = [
+        _desc_with_ocr(
+            name="MM-130-a.jpeg",
+            description="Axial CT.",
+            ocr_text="Slice 24 / 60",
+        ),
+    ]
+    rendered = render_appended_section(descriptions)
+
+    assert "**OCR text:**" in rendered
+    assert "Slice 24 / 60" in rendered
+    assert "Axial CT." in rendered
+
+
+def test_render_omits_ocr_section_when_ocr_text_is_none():
+    descriptions = [_desc(name="Im0", description="A clean radiograph.")]
+    rendered = render_appended_section(descriptions)
+
+    assert "**Embedded image:** `Im0`" in rendered
+    assert "**OCR text:**" not in rendered
+    assert "**Visual description:**" in rendered
+    # No raw HTML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **" not in rendered
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
+#
+# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
+# premium both emit ``<figure>...</figure>`` blocks that already contain
+# the parser's own OCR of the figure (chart bar values, axis labels,
+# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
+# That parser-side content is useful for retrieval on its own, so we
+# PRESERVE the figure verbatim and append our vision-LLM block
+# immediately after rather than substituting for it.
+# ---------------------------------------------------------------------------
+
+
+def test_inject_appends_block_after_figure_preserving_parser_content():
+    """Figure block stays intact; vision-LLM block goes right after it."""
+    markdown = (
+        "Some narrative text.\n\n"
+        "<figure>\n\n"
+        "Republican\n68\nDemocrat\n30\n"
+        "\n</figure>\n\n"
+        "Following paragraph.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Original figure is preserved verbatim -- the parser's OCR'd
+    # numbers must still be searchable.
+    assert "<figure>" in out
+    assert "</figure>" in out
+    assert "Republican" in out and "68" in out
+    # Our vision-LLM block follows the figure, not before / inside it.
+    assert "**Embedded image:** `Im0`" in out
+    assert "Bar chart of party ID." in out
+    figure_close = out.index("</figure>")
+    embedded_at = out.index("**Embedded image:** `Im0`")
+    assert figure_close < embedded_at, "block must be appended AFTER </figure>"
+    # Surrounding narrative is preserved.
+    assert "Some narrative text." in out
+    assert "Following paragraph." in out
+
+
+def test_inject_handles_multiple_figures_in_document_order():
+    """N figures + N descriptions: each pair lands in the right place."""
+    markdown = (
+        "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n"
+        "Between\n\n<figure>\nChart B bars\n</figure>\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
+                description="Description of chart A.",
+            ),
+            PictureDescription(
+                page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
+                description="Description of chart B.",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Both figures preserved; both descriptions inlined; order matches.
+    assert out.count("<figure>") == 2
+    assert out.count("</figure>") == 2
+    assert "Description of chart A." in out
+    assert "Description of chart B." in out
+    assert out.index("Description of chart A.") < out.index(
+        "Description of chart B."
+    )
+    # Each description appears AFTER its corresponding </figure>.
+    first_close = out.index("</figure>")
+    assert first_close < out.index("Description of chart A.")
+    second_close = out.index("</figure>", first_close + 1)
+    assert second_close < out.index("Description of chart B.")
+
+
+def test_inject_figures_with_attributes_and_nested_tags():
+    """``<figure>`` with attributes and nested tags is matched and preserved."""
+    markdown = (
+        '<figure id="fig-3" class="chart">\n'
+        '<figcaption>Source: Pew Research</figcaption>\n'
+        "<table><tr><td>Republican</td><td>57</td></tr></table>\n"
+        "</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Survey table.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # All nested HTML is preserved (chunking will pick it up).
+    assert 'id="fig-3"' in out
+    assert "<figcaption>Source: Pew Research</figcaption>" in out
+    assert "<table>" in out and "Republican" in out and "57" in out
+    # Our block sits after the closing tag.
+    assert out.index("</figure>") < out.index("**Embedded image:** `Im0`")
+
+
+def test_inject_figures_more_descriptions_than_figures_returns_remaining():
+    """Three descriptions, one figure -> one inlined, two left for caller."""
+    markdown = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First desc."),
+            _desc(name="Im1", description="Second desc."),
+            _desc(name="Im2", description="Third desc."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "First desc." in out
+    # Leftovers are the caller's job; inject_descriptions_inline does
+    # not append them on its own.
+    assert "Second desc." not in out
+    assert "Third desc." not in out
+
+
+def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
+    """Two figures, one description -> first figure enriched, second left raw."""
+    markdown = (
+        "<figure>\nfigure 1 content\n</figure>\n"
+        "<figure>\nfigure 2 content\n</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Only description.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Both figures still present; only the first one was enriched.
+    assert out.count("<figure>") == 2
+    assert "Only description." in out
+    # Second figure has no embedded-image block immediately after it.
+    second_open = out.index("<figure>", out.index("<figure>") + 1)
+    second_close = out.index("</figure>", second_open)
+    after_second = out[second_close:]
+    assert "**Embedded image:**" not in after_second
+
+
+def test_merge_inlines_at_figure_boundary():
+    """Top-level helper does the right thing with figures (no leftover section)."""
+    markdown = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    # Inline succeeded -> no appended-section heading.
+    assert "## Image Content" not in out
+    assert "Bar chart." in out
+    assert "<figure>" in out and "</figure>" in out
+
+
+def test_inject_figures_then_falls_through_to_docling_marker():
+    """Mixed-marker doc: figure consumed first, then Docling placeholder.
+
+    Defensive -- single docs are usually one parser's output, but if a
+    pipeline ever stitches two parsers' markdowns together the inliner
+    should still place each description.
+    """
+    markdown = (
+        "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n"
+        "Later in the doc:\n\n"
+        "<!-- image -->\nImage: scan.jpeg\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="Chart description."),
+            _desc(name="Im1", description="Scan description."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Figure preserved + augmented.
+    assert "<figure>" in out and "Chart bars: 50, 40, 30" in out
+    assert "Chart description." in out
+    # Docling placeholder + caption replaced.
+    assert "<!-- image -->" not in out
+    assert "Image: scan.jpeg" not in out
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Scan description." in out
--- a/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
@ -0,0 +1,146 @@
+"""Unit tests for the vision_llm parser helpers.
+
+Two helpers exist:
+
+- :func:`parse_with_vision_llm` -- single-shot for standalone image
+  uploads (.png/.jpg/etc). Returns combined markdown (description +
+  verbatim OCR mixed) since the image *is* the document.
+- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
+  visual description only; OCR is the ETL service's job.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# parse_with_vision_llm: legacy single-shot path
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
+    """Standalone image uploads still go through the combined-markdown path."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "# A scan of something."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+    assert out == "# A scan of something."
+    fake_llm.ainvoke.assert_awaited_once()
+
+
+async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
+    """An empty model response raises rather than silently returning blanks."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = ""
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# parse_image_for_description: per-image-in-PDF, description only
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_image_for_description_returns_description(tmp_path):
+    """Description-only path returns the model's markdown unchanged."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "Axial CT showing a large cystic mass."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_image_for_description(str(img), "scan.png", fake_llm)
+    assert out == "Axial CT showing a large cystic mass."
+
+
+async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
+    """The prompt explicitly tells the model NOT to transcribe text.
+
+    This is the contract that lets us drop OCR from the response: the
+    ETL pipeline already has the text (from page-level OCR), so asking
+    the vision LLM for it would be redundant cost.
+    """
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "A description"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+    # The prompt is the first text part of the message we sent.
+    sent_messages = fake_llm.ainvoke.call_args.args[0]
+    prompt_text = sent_messages[0].content[0]["text"].lower()
+    assert "describe what this image visually depicts" in prompt_text
+    assert "do not transcribe text" in prompt_text
+
+
+async def test_parse_image_for_description_rejects_empty(tmp_path):
+    """Empty response surfaces as ValueError so the caller can skip the image."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "   "  # whitespace-only counts as empty
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# Image size + extension validation (shared by both paths)
+# ---------------------------------------------------------------------------
+
+
+def test_image_to_data_url_rejects_oversized(tmp_path):
+    """Images larger than 5 MB raise before any LLM call is made."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    big = tmp_path / "huge.png"
+    big.write_bytes(b"\x89PNG" + b"\x00" * (6 * 1024 * 1024))
+
+    with pytest.raises(ValueError, match="Image too large"):
+        _image_to_data_url(str(big))
+
+
+def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
+    """Unknown extensions raise rather than guessing a MIME type."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    weird = tmp_path / "scan.xyz"
+    weird.write_bytes(b"\x00" * 100)
+
+    with pytest.raises(ValueError, match="Unsupported image extension"):
+        _image_to_data_url(str(weird))