Merge remote-tracking branch 'upstream/dev' into feature/multi-agent-with-task-parallelization

2026-05-17 18:35:19 +02:00 · 2026-05-15 16:44:22 +02:00 · 2026-05-15 16:44:22 +02:00 · 4980f9f1ba
commit 4980f9f1ba
parent 5327f3348c eea2d68098
193 changed files with 32777 additions and 565 deletions
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@ -31,7 +31,7 @@ jobs:
      new_tag: ${{ steps.tag_version.outputs.next_version }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.branch }}
@ -108,16 +108,18 @@ jobs:
            name: surfsense-backend
            context: ./surfsense_backend
            file: ./surfsense_backend/Dockerfile
+            target: production
          - image: web
            name: surfsense-web
            context: ./surfsense_web
            file: ./surfsense_web/Dockerfile
+            target: runner
    env:
      REGISTRY_IMAGE: ghcr.io/${{ github.repository_owner }}/${{ matrix.name }}

    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set lowercase image name
        id: image
@ -125,19 +127,19 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v6
        with:
          images: ${{ steps.image.outputs.name }}

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v4

      - name: Free up disk space
        run: |
@ -149,10 +151,11 @@ jobs:

      - name: Build and push by digest ${{ matrix.name }} (${{ matrix.suffix }})
        id: build
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v7
        with:
          context: ${{ matrix.context }}
          file: ${{ matrix.file }}
+          target: ${{ matrix.target }}
          labels: ${{ steps.meta.outputs.labels }}
          tags: ${{ steps.image.outputs.name }}
          outputs: type=image,push-by-digest=true,name-canonical=true,push=true
@ -174,7 +177,7 @@ jobs:
          touch "/tmp/digests/${digest#sha256:}"

      - name: Upload digest
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v7
        with:
          name: digests-${{ matrix.image }}-${{ matrix.suffix }}
          path: /tmp/digests/*
@ -205,22 +208,22 @@ jobs:
        run: echo "name=${REGISTRY_IMAGE,,}" >> $GITHUB_OUTPUT

      - name: Download amd64 digest
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v8
        with:
          name: digests-${{ matrix.image }}-amd64
          path: /tmp/digests

      - name: Download arm64 digest
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v8
        with:
          name: digests-${{ matrix.image }}-arm64
          path: /tmp/digests

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v4

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
+        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
@ -239,7 +242,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v6
        with:
          images: ${{ steps.image.outputs.name }}
          tags: |
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@ -0,0 +1,174 @@
+name: E2E Tests
+
+on:
+  pull_request:
+    branches: [main, dev]
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - 'surfsense_web/**'
+      - 'surfsense_backend/**'
+      - 'docker/docker-compose.e2e.yml'
+      - '.github/workflows/e2e-tests.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    name: Journey
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.draft == false
+    timeout-minutes: 30
+
+    env:
+      # Test user that the backend creates via /auth/register before Playwright runs.
+      PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net
+      PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123!
+      # Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts)
+      # spawns `pnpm build && pnpm start` in CI; these get baked into the build.
+      NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000
+      NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: LOCAL
+      # Shared secret for the test-only POST /__e2e__/auth/token endpoint.
+      # Must match docker-compose.e2e.yml's backend env (x-backend-env).
+      E2E_MINT_SECRET: e2e-mint-secret-not-for-production
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      # Plain `run:` steps do not get ACTIONS_RUNTIME_TOKEN; export it for `type=gha`.
+      - uses: crazy-max/ghaction-github-runtime@v3
+      # ─── Backend stack ─────────────────────────────────────────────────
+      # Builds the e2e image (multi-stage, deps cached via GHA), brings up
+      # db + redis + backend + celery_worker, blocks until every healthcheck
+      # is green. No `uv` invocation on the runner; no PID files; no curl
+      # polling loops; readiness is gated by Docker healthchecks.
+      - name: Build & start backend stack
+        run: docker compose -f docker/docker-compose.e2e.yml up -d --build --wait --wait-timeout 300
+
+      - name: Show backend stack status
+        if: always()
+        run: docker compose -f docker/docker-compose.e2e.yml ps
+
+      - name: Register E2E test user
+        run: |
+          # 200/201 = created, 400 = already exists (idempotent across reruns).
+          STATUS=$(curl -s -o /tmp/register.json -w "%{http_code}" \
+            -X POST http://localhost:8000/auth/register \
+            -H "Content-Type: application/json" \
+            -d "{\"email\":\"${PLAYWRIGHT_TEST_EMAIL}\",\"password\":\"${PLAYWRIGHT_TEST_PASSWORD}\"}")
+          echo "Register status: ${STATUS}"
+          cat /tmp/register.json
+          if [ "${STATUS}" != "200" ] && [ "${STATUS}" != "201" ] && [ "${STATUS}" != "400" ]; then
+            echo "::error::Failed to register test user (status ${STATUS})"
+            exit 1
+          fi
+
+          # Flush auth rate-limit counters so Playwright starts clean.
+          docker compose -f docker/docker-compose.e2e.yml exec -T redis \
+            sh -c "redis-cli --scan --pattern 'surfsense:auth_rate_limit:*' \
+              | xargs -r redis-cli DEL" || true
+
+      # ─── Frontend (host-side) ──────────────────────────────────────────
+      # Playwright's webServer block in playwright.config.ts spawns
+      # `pnpm build && pnpm start` in CI mode and waits for :3000.
+      - uses: actions/setup-node@v6
+        with:
+          node-version: '20'
+
+      - uses: pnpm/action-setup@v6
+
+      - name: Get pnpm store directory
+        id: pnpm-cache
+        shell: bash
+        run: echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_OUTPUT
+
+      - name: Cache pnpm store
+        uses: actions/cache@v5
+        with:
+          path: ${{ steps.pnpm-cache.outputs.STORE_PATH }}
+          key: pnpm-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
+          restore-keys: pnpm-${{ runner.os }}-
+
+      - name: Install web dependencies
+        working-directory: surfsense_web
+        run: pnpm install --frozen-lockfile
+
+      - name: Cache Playwright browsers
+        id: playwright-cache
+        uses: actions/cache@v5
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
+
+      - name: Install Playwright browsers
+        if: steps.playwright-cache.outputs.cache-hit != 'true'
+        working-directory: surfsense_web
+        run: pnpm exec playwright install --with-deps chromium
+
+      - name: Install Playwright system deps (cache hit)
+        if: steps.playwright-cache.outputs.cache-hit == 'true'
+        working-directory: surfsense_web
+        run: pnpm exec playwright install-deps chromium
+
+      - name: Cache Next.js build
+        uses: actions/cache@v5
+        with:
+          path: surfsense_web/.next/cache
+          key: nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-${{ github.sha }}
+          restore-keys: |
+            nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-
+            nextjs-${{ runner.os }}-
+
+      # ─── Tests ─────────────────────────────────────────────────────────
+      - name: Run Playwright tests
+        working-directory: surfsense_web
+        run: pnpm test:e2e:prod
+
+      # ─── Failure diagnostics ───────────────────────────────────────────
+      - name: Dump backend stack logs on failure
+        if: ${{ failure() || cancelled() }}
+        run: |
+          mkdir -p ./compose-logs
+          docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps \
+            > ./compose-logs/all-services.log 2>&1 || true
+          for svc in db redis backend celery_worker; do
+            docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps "$svc" \
+              > "./compose-logs/${svc}.log" 2>&1 || true
+          done
+          docker compose -f docker/docker-compose.e2e.yml ps \
+            > ./compose-logs/ps.txt 2>&1 || true
+
+      # ─── Artifacts ─────────────────────────────────────────────────────
+      - name: Upload Playwright HTML report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: playwright-report
+          path: surfsense_web/playwright-report/
+          retention-days: 14
+
+      - name: Upload Playwright traces
+        if: failure()
+        uses: actions/upload-artifact@v7
+        with:
+          name: playwright-traces
+          path: surfsense_web/test-results/
+          retention-days: 14
+
+      - name: Upload backend stack logs
+        if: ${{ failure() || cancelled() }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: backend-stack-logs
+          path: ./compose-logs/
+          retention-days: 7
+
+      # ─── Teardown ──────────────────────────────────────────────────────
+      - name: Tear down backend stack
+        if: always()
+        run: docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
--- a/.gitignore
+++ b/.gitignore
@ -17,3 +17,5 @@ surfsense_web/test-results/
 surfsense_web/blob-report/
 hermes-agent
 hermes-agent/
+
+content_research/
--- a/docker/.env.example
+++ b/docker/.env.example
@ -4,7 +4,7 @@
 # Database, Redis, and internal service wiring are handled automatically.
 # ==============================================================================

-# SurfSense version (use "latest", a clean version like "0.0.14", or a specific build like "0.0.14.1")
+# SurfSense version (use "latest" or a specific version like "0.0.14")
 SURFSENSE_VERSION=latest

 # ------------------------------------------------------------------------------
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@ -10,6 +10,11 @@

 name: surfsense-dev

+x-backend-build: &backend-build
+  context: ../surfsense_backend
+  args:
+    EMBEDDING_MODEL: ${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
+
 services:
  db:
    image: pgvector/pgvector:pg17
@ -69,7 +74,7 @@ services:
      retries: 5

  backend:
-    build: ../surfsense_backend
+    build: *backend-build
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
@ -114,7 +119,7 @@ services:
      start_period: 200s

  celery_worker:
-    build: ../surfsense_backend
+    build: *backend-build
    volumes:
      - ../surfsense_backend/app:/app/app
      - shared_temp:/shared_tmp
@ -140,7 +145,7 @@ services:
        condition: service_healthy

  celery_beat:
-    build: ../surfsense_backend
+    build: *backend-build
    env_file:
      - ../surfsense_backend/.env
    environment:
@ -159,7 +164,7 @@ services:
        condition: service_started

  # flower:
-  #   build: ../surfsense_backend
+  #   build: *backend-build
  #   ports:
  #     - "${FLOWER_PORT:-5555}:5555"
  #   env_file:
--- a/docker/docker-compose.e2e.yml
+++ b/docker/docker-compose.e2e.yml
@ -0,0 +1,181 @@
+# =============================================================================
+# SurfSense — E2E Docker Compose stack
+# =============================================================================
+# Hermetic backend stack for Playwright E2E tests:
+#   - db / redis on an internal-only network (no internet egress)
+#   - backend (FastAPI) joins the internal network AND a separate ingress
+#     bridge so the host runner can reach :8000
+#   - celery_worker on the internal network only — zero egress surface
+#
+# The backend image is built from surfsense_backend/Dockerfile target=e2e,
+# which adds tests/ via the `tests-source` additional context (tests/ is
+# excluded from the main context by .dockerignore so production never ships
+# test fakes). See surfsense_backend/Dockerfile for stage layout.
+#
+# Usage from repo root:
+#   docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+#   curl -X POST http://localhost:8000/auth/register ...
+#   ( run Playwright on host, pointing at localhost:8000 + localhost:3000 )
+#   docker compose -f docker/docker-compose.e2e.yml down -v
+# =============================================================================
+
+name: surfsense-e2e
+
+x-backend-env: &backend-env
+  DATABASE_URL: postgresql+asyncpg://postgres:postgres@db:5432/surfsense_e2e
+  CELERY_BROKER_URL: redis://redis:6379/0
+  CELERY_RESULT_BACKEND: redis://redis:6379/0
+  REDIS_APP_URL: redis://redis:6379/0
+  CELERY_TASK_DEFAULT_QUEUE: surfsense
+  SECRET_KEY: ci-test-secret-key-not-for-production
+  AUTH_TYPE: LOCAL
+  REGISTRATION_ENABLED: "TRUE"
+  ETL_SERVICE: DOCLING
+  EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
+  NEXT_FRONTEND_URL: http://host.docker.internal:3000
+  # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+  COMPOSIO_API_KEY: e2e-deny-real-call-sentinel
+  COMPOSIO_ENABLED: "TRUE"
+  OPENAI_API_KEY: e2e-deny-real-call-sentinel
+  ANTHROPIC_API_KEY: e2e-deny-real-call-sentinel
+  LITELLM_API_KEY: e2e-deny-real-call-sentinel
+  MICROSOFT_CLIENT_ID: fake-microsoft-client-id
+  MICROSOFT_CLIENT_SECRET: fake-microsoft-client-secret
+  ONEDRIVE_REDIRECT_URI: http://localhost:8000/api/v1/auth/onedrive/connector/callback
+  DROPBOX_APP_KEY: fake-dropbox-app-key
+  DROPBOX_APP_SECRET: fake-dropbox-app-secret
+  DROPBOX_REDIRECT_URI: http://localhost:8000/api/v1/auth/dropbox/connector/callback
+  # Defense-in-depth: even though L3 egress is denied for the worker via
+  # `internal: true`, the backend still has a route via `ingress`. Setting
+  # HTTPS_PROXY to an unreachable port turns any leaked Python outbound HTTP
+  # call into a fast Connection refused. UNLIKE the old runner-shell setup,
+  # this proxy is set on the container env and `uv` is never invoked here,
+  # so there is no interaction with uv's implicit-sync behaviour.
+  HTTPS_PROXY: http://127.0.0.1:1
+  HTTP_PROXY: http://127.0.0.1:1
+  NO_PROXY: localhost,127.0.0.1,0.0.0.0,db,redis,host.docker.internal
+  HF_HUB_OFFLINE: "1"
+  TRANSFORMERS_OFFLINE: "1"
+  # Test-only token-mint endpoint secret (see tests/e2e/run_backend.py).
+  E2E_MINT_SECRET: e2e-mint-secret-not-for-production
+
+services:
+  db:
+    image: pgvector/pgvector:pg17
+    command: >
+      postgres
+        -c wal_level=logical
+        -c max_wal_senders=10
+        -c max_replication_slots=10
+    environment:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: surfsense_e2e
+    # Ephemeral storage — every CI run gets a clean DB, no volume cleanup needed.
+    tmpfs:
+      - /var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres -d surfsense_e2e"]
+      interval: 2s
+      timeout: 3s
+      retries: 30
+    networks: [internal]
+
+  redis:
+    image: redis:8-alpine
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 2s
+      timeout: 3s
+      retries: 30
+    networks: [internal]
+
+  backend:
+    build:
+      context: ../surfsense_backend
+      dockerfile: Dockerfile
+      target: e2e
+      additional_contexts:
+        # tests/ is excluded from the main context by .dockerignore;
+        # the e2e stage's `COPY --from=tests-source` pulls it in here.
+        tests-source: ../surfsense_backend/tests
+      args:
+        EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
+      cache_from:
+        - type=gha,scope=surfsense-e2e-backend
+      cache_to:
+        - type=gha,mode=max,scope=surfsense-e2e-backend
+    image: surfsense-e2e-backend:local
+    environment:
+      <<: *backend-env
+      SERVICE_ROLE: api
+    volumes:
+      - shared_temp:/shared_tmp
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    ports:
+      - "8000:8000"
+    depends_on:
+      db: { condition: service_healthy }
+      redis: { condition: service_healthy }
+    healthcheck:
+      # Use Python (already in the image) instead of curl/wget to avoid
+      # depending on either tool being installed in the runtime layers.
+      test:
+        - CMD
+        - python
+        - -c
+        - |
+          import sys, urllib.request
+          try:
+              r = urllib.request.urlopen("http://localhost:8000/openapi.json", timeout=2)
+              sys.exit(0 if r.status == 200 else 1)
+          except Exception:
+              sys.exit(1)
+      interval: 3s
+      timeout: 5s
+      retries: 60
+      start_period: 30s
+    networks:
+      - internal      # to reach db/redis
+      - ingress       # so host can reach :8000
+
+  celery_worker:
+    image: surfsense-e2e-backend:local
+    pull_policy: never
+    # No build: section — reuses the image that the `backend` service builds
+    # and tags as surfsense-e2e-backend:local. `pull_policy: never` stops
+    # Compose from trying to pull that local-only tag from a registry.
+    environment:
+      <<: *backend-env
+      SERVICE_ROLE: worker
+    volumes:
+      - shared_temp:/shared_tmp
+    depends_on:
+      backend: { condition: service_healthy }
+    healthcheck:
+      test:
+        - CMD-SHELL
+        - "celery -A app.celery_app inspect ping --timeout 2 | grep -q pong"
+      interval: 5s
+      timeout: 5s
+      retries: 12
+      start_period: 20s
+    networks: [internal]
+
+networks:
+  # Internal network: containers attached only to this network have NO route
+  # to the host or the internet. This is the L3 deny-egress mechanism that
+  # replaces the fragile HTTPS_PROXY-on-the-runner approach.
+  internal:
+    driver: bridge
+    internal: true
+
+  # Regular bridge network. Only the `backend` service joins it, solely so
+  # the host can reach :8000 via the published port. celery_worker / db /
+  # redis stay off this network entirely.
+  ingress:
+    driver: bridge
+
+volumes:
+  shared_temp:
--- a/package.json
+++ b/package.json
@ -1,5 +1,5 @@
 {
  "name": "surfsense",
  "private": true,
-  "packageManager": "pnpm@10.24.0"
+  "packageManager": "pnpm@10.26.0"
 }
--- a/surfsense_backend/.gitignore
+++ b/surfsense_backend/.gitignore
@ -13,5 +13,5 @@ celerybeat-schedule*
 celerybeat-schedule.*
 celerybeat-schedule.dir
 celerybeat-schedule.bak
-global_llm_config.yaml
+/app/config/global_llm_config.yaml
 app/templates/_generated/
--- a/surfsense_backend/Dockerfile
+++ b/surfsense_backend/Dockerfile
@ -1,8 +1,16 @@
-FROM python:3.12-slim
+# =============================================================================
+# SurfSense Backend — Multi-stage Dockerfile
+# =============================================================================
+# Graph: base → deps → models → {e2e, production}
+#   e2e        — tests/ via additional_contexts (docker-compose.e2e.yml)
+#   production — published ghcr.io image (docker-build.yml pins target)
+# =============================================================================
+
+# ─── Stage 1: base (system deps, Pandoc, certificates) ──────────────────────
+FROM python:3.12-slim AS base

 WORKDIR /app

-# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
 RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    python3-dev \
@ -11,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    unzip \
    gnupg2 \
+    ffmpeg \
    espeak-ng \
    libsndfile1 \
    libgl1 \
@ -22,21 +31,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

-# Install Pandoc 3.x from GitHub as a fallback for Linux where pypandoc_binary
-# may not bundle pandoc (apt ships 2.17 which has broken table rendering).
-# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks this up.
+RUN which ffmpeg && ffmpeg -version
+
+# Pandoc 3.x from GitHub Releases — apt ships 2.17 which has broken table rendering.
+# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks up this binary.
 RUN ARCH=$(dpkg --print-architecture) && \
    wget -qO /tmp/pandoc.deb "https://github.com/jgm/pandoc/releases/download/3.9/pandoc-3.9-1-${ARCH}.deb" && \
    dpkg -i /tmp/pandoc.deb && \
    rm /tmp/pandoc.deb

-# Update certificates and install SSL tools
 RUN update-ca-certificates
 RUN pip install --upgrade certifi pip-system-certs

-# Copy requirements
-COPY pyproject.toml .
-COPY uv.lock .
+ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+ENV SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD=FALSE
+
+
+# ─── Stage 2: deps (Python deps frozen from uv.lock) ────────────────────────
+FROM base AS deps
+
+COPY pyproject.toml uv.lock ./

 # Install all Python dependencies from uv.lock for deterministic builds.
 #
@ -49,9 +64,7 @@ COPY uv.lock .
 # Note on torch/CUDA: we do NOT install torch from a separate cu* index here.
 # PyPI's torch wheels for Linux x86_64 already ship CUDA-enabled and pull
 # nvidia-cudnn-cu13, nvidia-nccl-cu13, triton, etc. as install deps (all
-# captured in uv.lock). Installing from cu121 first only wasted ~2GB of
-# downloads that the lock-based install immediately replaced. If a specific
-# CUDA version is needed (driver compatibility, etc.), wire it through
+# captured in uv.lock). If a specific CUDA version is needed, wire it through
 # [tool.uv.sources] in pyproject.toml so the lock stays the source of truth.
 RUN pip install --no-cache-dir uv && \
    uv export --frozen --no-dev --no-hashes --no-emit-project \
@ -59,49 +72,42 @@ RUN pip install --no-cache-dir uv && \
    uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \
    rm /tmp/requirements.txt

-# Set SSL environment variables dynamically
-RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
-    echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
-    echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
-    echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
-ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
-ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
+
+# ─── Stage 3: models (pre-baked offline assets) ─────────────────────────────
+FROM deps AS models

 # Pre-download EasyOCR models to avoid runtime SSL issues
-RUN mkdir -p /root/.EasyOCR/model
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
-RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
-RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
+# TLS is verified (CA store refreshed in `base`); each fetch stays best-effort via `|| true`.
+RUN mkdir -p /root/.EasyOCR/model && \
+    wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true && \
+    wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true && \
+    cd /root/.EasyOCR/model && \
+    (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)

 # Pre-download Docling models
-RUN python -c "try:\n    from docling.document_converter import DocumentConverter\n    conv = DocumentConverter()\nexcept:\n    pass" || true
+RUN printf '%s\n' \
+    'try:' \
+    '    from docling.document_converter import DocumentConverter' \
+    '    DocumentConverter()' \
+    'except Exception:' \
+    '    pass' \
+    | python || true

-# Install Playwright browsers for web scraping (the playwright package itself
-# is already installed via uv.lock above)
+ARG EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+RUN python -c "from chonkie import AutoEmbeddings; AutoEmbeddings.get_embeddings('${EMBEDDING_MODEL}')"
+
+# Install Playwright browsers (the playwright python package itself is in deps)
 RUN playwright install chromium --with-deps

-# Copy source code
-COPY . .
-
-# Install the project itself in editable mode. Dependencies were already
-# installed deterministically from uv.lock above, so --no-deps prevents any
-# re-resolution that could pull newer versions.
-RUN uv pip install --system --no-cache-dir --no-deps -e .
-
-# Copy and set permissions for entrypoint script
-# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
-COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
-RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
-
 # Shared temp directory for file uploads between API and Worker containers.
 # Python's tempfile module uses TMPDIR, so uploaded files land here.
 # Mount the SAME volume at /shared_tmp on both API and Worker in Coolify.
 RUN mkdir -p /shared_tmp
-ENV TMPDIR=/shared_tmp

-# Prevent uvloop compatibility issues
 ENV PYTHONPATH=/app
 ENV UVICORN_LOOP=asyncio
+ENV TMPDIR=/shared_tmp
+ENV PYTHONUNBUFFERED=1

 # Tune glibc malloc to return freed memory to the OS more aggressively.
 # Without these, Python's gc.collect() frees objects but the underlying
@ -110,6 +116,56 @@ ENV MALLOC_MMAP_THRESHOLD_=65536
 ENV MALLOC_TRIM_THRESHOLD_=131072
 ENV MALLOC_MMAP_MAX_=65536

+
+# ─── Stage 4: e2e (production source + tests/ + e2e entrypoint) ─────────────
+# Built via `docker buildx build --target e2e`. The default build target is
+# `production` (the last stage), so this stage is opt-in for CI only.
+#
+# `tests/` is excluded from the main build context by .dockerignore (so prod
+# can never accidentally ship test fakes). The e2e stage receives tests/
+# through an "additional context" passed by docker-compose.e2e.yml — see
+# https://docs.docker.com/reference/compose-file/build/#additional_contexts
+FROM models AS e2e
+
+# Same source copy as production. .dockerignore filters out tests/.
+COPY . .
+
+# Bring tests/ in via the named additional build context. CI passes
+#   --build-context tests-source=./tests
+# (or the equivalent additional_contexts entry in docker-compose.e2e.yml).
+COPY --from=tests-source . ./tests/
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+COPY scripts/docker/entrypoint.e2e.sh /app/scripts/docker/entrypoint.e2e.sh
+RUN dos2unix /app/scripts/docker/entrypoint.e2e.sh && chmod +x /app/scripts/docker/entrypoint.e2e.sh
+
+# SERVICE_ROLE is overridden per service in docker-compose.e2e.yml (api / worker).
+ENV SERVICE_ROLE=api
+
+EXPOSE 8000-8001
+CMD ["/app/scripts/docker/entrypoint.e2e.sh"]
+
+
+# ─── Stage 5: production (published ghcr.io image) ──────────────────────────
+# CI pins `target: production`; also the default for `docker build` / dev compose.
+FROM models AS production
+
+# Copy source code (tests/ excluded by .dockerignore — production never ships tests).
+COPY . .
+
+# Install the project itself in editable mode. Dependencies were already
+# installed deterministically from uv.lock above, so --no-deps prevents any
+# re-resolution that could pull newer versions.
+RUN uv pip install --system --no-cache-dir --no-deps -e .
+
+# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
+COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
+RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
+
 # SERVICE_ROLE controls which process this container runs:
 #   api     – FastAPI backend only (runs migrations on startup)
 #   worker  – Celery worker only
@ -127,6 +183,5 @@ ENV CELERY_MAX_TASKS_PER_CHILD=50
 #   ""                       – both queues (default, for single-worker setups)
 ENV CELERY_QUEUES=""

-# Run
 EXPOSE 8000-8001
-CMD ["/app/scripts/docker/entrypoint.sh"]
+CMD ["/app/scripts/docker/entrypoint.sh"]
--- a/surfsense_backend/alembic/env.py
+++ b/surfsense_backend/alembic/env.py
@ -67,7 +67,11 @@ def run_migrations_offline() -> None:


 def do_run_migrations(connection: Connection) -> None:
-    context.configure(connection=connection, target_metadata=target_metadata)
+    context.configure(
+        connection=connection,
+        target_metadata=target_metadata,
+        transaction_per_migration=True,
+    )

    with context.begin_transaction():
        context.run_migrations()
--- a/surfsense_backend/alembic/versions/130_add_agent_action_log.py
+++ b/surfsense_backend/alembic/versions/130_add_agent_action_log.py
@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_action_log"):
+        return
+
    op.create_table(
        "agent_action_log",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/131_add_document_revisions.py
+++ b/surfsense_backend/alembic/versions/131_add_document_revisions.py
@ -29,6 +29,21 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    """Create whichever revision tables are missing; no-op when both exist."""
+    inspector = sa.inspect(op.get_bind())
+
+    document_revisions_missing = not inspector.has_table("document_revisions")
+    folder_revisions_missing = not inspector.has_table("folder_revisions")
+
+    # Each table is guarded on its own, so a database that already holds one
+    # of the two tables only receives the table that is still absent.
+    if document_revisions_missing:
+        _create_document_revisions()
+    if folder_revisions_missing:
+        _create_folder_revisions()
+
+
+def _create_document_revisions() -> None:
    op.create_table(
        "document_revisions",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
@ -74,6 +89,8 @@ def upgrade() -> None:
        ),
    )

+
+def _create_folder_revisions() -> None:
    op.create_table(
        "folder_revisions",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
+++ b/surfsense_backend/alembic/versions/132_add_agent_permission_rules.py
@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
+    bind = op.get_bind()
+    if sa.inspect(bind).has_table("agent_permission_rules"):
+        return
+
    op.create_table(
        "agent_permission_rules",
        sa.Column("id", sa.Integer(), primary_key=True, index=True),
--- a/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
+++ b/surfsense_backend/alembic/versions/135_action_log_correlation_ids.py
@ -50,29 +50,39 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
-    op.add_column(
-        "agent_action_log",
-        sa.Column("tool_call_id", sa.String(length=64), nullable=True),
-    )
-    op.add_column(
-        "agent_action_log",
-        sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
-    )
+    # Snapshot the table's current column and index names once; every step
+    # below is skipped when the column / index it would create already exists.
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("agent_action_log")}
+    indexes = {i["name"] for i in inspector.get_indexes("agent_action_log")}

-    op.create_index(
-        "ix_agent_action_log_tool_call_id",
-        "agent_action_log",
-        ["tool_call_id"],
-    )
-    op.create_index(
-        "ix_agent_action_log_chat_turn_id",
-        "agent_action_log",
-        ["chat_turn_id"],
-    )
+    if "tool_call_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("tool_call_id", sa.String(length=64), nullable=True),
+        )
+    if "chat_turn_id" not in columns:
+        op.add_column(
+            "agent_action_log",
+            sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
+        )

-    op.execute(
-        "UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
-    )
+    if "ix_agent_action_log_tool_call_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_tool_call_id",
+            "agent_action_log",
+            ["tool_call_id"],
+        )
+    if "ix_agent_action_log_chat_turn_id" not in indexes:
+        op.create_index(
+            "ix_agent_action_log_chat_turn_id",
+            "agent_action_log",
+            ["chat_turn_id"],
+        )
+
+    # Backfill tool_call_id from turn_id, but only when turn_id was present in
+    # the snapshot taken at the top (the UPDATE would fail without it). Rows
+    # that already have a tool_call_id are left untouched.
+    if "turn_id" in columns:
+        op.execute(
+            "UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
+        )


 def downgrade() -> None:
--- a/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
+++ b/surfsense_backend/alembic/versions/136_new_chat_message_turn_id.py
@ -36,15 +36,22 @@ depends_on: str | Sequence[str] | None = None


 def upgrade() -> None:
-    op.add_column(
-        "new_chat_messages",
-        sa.Column("turn_id", sa.String(length=64), nullable=True),
-    )
-    op.create_index(
-        "ix_new_chat_messages_turn_id",
-        "new_chat_messages",
-        ["turn_id"],
-    )
+    # Inspect the live table first so the column and its index are each
+    # created only when they do not exist yet.
+    bind = op.get_bind()
+    inspector = sa.inspect(bind)
+    columns = {c["name"] for c in inspector.get_columns("new_chat_messages")}
+    indexes = {i["name"] for i in inspector.get_indexes("new_chat_messages")}
+
+    if "turn_id" not in columns:
+        op.add_column(
+            "new_chat_messages",
+            sa.Column("turn_id", sa.String(length=64), nullable=True),
+        )
+    # The index check is independent of the column check: a table that has
+    # the column but not the index still gets the index.
+    if "ix_new_chat_messages_turn_id" not in indexes:
+        op.create_index(
+            "ix_new_chat_messages_turn_id",
+            "new_chat_messages",
+            ["turn_id"],
+        )


 def downgrade() -> None:
--- a/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
+++ b/surfsense_backend/alembic/versions/137_unique_reverse_of_in_action_log.py
@ -27,6 +27,8 @@ from __future__ import annotations

 from collections.abc import Sequence

+import sqlalchemy as sa
+
 from alembic import op

 revision: str = "137"
@ -39,6 +41,11 @@ _INDEX_NAME = "ux_agent_action_log_reverse_of"


 def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes("agent_action_log")}
+    if _INDEX_NAME in indexes:
+        return
+
    # Defensively de-dup any pre-existing double-revert rows before
    # adding the unique index. Keeps the OLDEST row (smallest id) and
    # NULLs out the duplicates' ``reverse_of`` so they survive as audit
--- a/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
+++ b/surfsense_backend/alembic/versions/141_unique_chat_message_turn_role.py
@ -53,6 +53,11 @@ TABLE_NAME = "new_chat_messages"


 def upgrade() -> None:
+    bind = op.get_bind()
+    indexes = {i["name"] for i in sa.inspect(bind).get_indexes(TABLE_NAME)}
+    if INDEX_NAME in indexes:
+        return
+
    op.create_index(
        INDEX_NAME,
        TABLE_NAME,
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -473,10 +473,16 @@ def initialize_vision_llm_router():
 class Config:
    # Check if ffmpeg is installed
    if not is_ffmpeg_installed():
-        import static_ffmpeg
+        allow_static_ffmpeg = (
+            os.getenv("SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD", "TRUE").upper()
+            == "TRUE"
+        )
+        if allow_static_ffmpeg:
+            import static_ffmpeg
+
+            # ffmpeg installed on first call to add_paths(), threadsafe.
+            static_ffmpeg.add_paths()

-        # ffmpeg installed on first call to add_paths(), threadsafe.
-        static_ffmpeg.add_paths()
        # check if ffmpeg is installed again
        if not is_ffmpeg_installed():
            raise ValueError(
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -134,12 +134,92 @@ class EtlPipelineService:
        else:
            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")

+        # When the operator opts into vision-LLM at ingest, walk the
+        # original file's embedded images and append a structured
+        # "Image Content" section. The parser's own OCR (Docling
+        # do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
+        # image; this side handles the *visual* description which the
+        # parsers all drop today.
+        content = await self._maybe_append_picture_descriptions(request, content)
+
        return EtlResult(
            markdown_content=content,
            etl_service=etl_service,
            content_type="document",
        )

+    async def _maybe_append_picture_descriptions(
+        self, request: EtlRequest, markdown: str
+    ) -> str:
+        """Enrich ``markdown`` with descriptions of the document's embedded images.
+
+        Returns ``markdown`` unchanged when no vision LLM is configured, when
+        ``describe_pictures`` raises, or when it yields no descriptions.
+        Otherwise returns the output of ``merge_descriptions_into_markdown``.
+        """
+        vision_llm = self._vision_llm
+        if vision_llm is None:
+            return markdown
+
+        from app.etl_pipeline.picture_describer import (
+            describe_pictures,
+            merge_descriptions_into_markdown,
+        )
+
+        async def _ocr_image(image_path: str, image_name: str) -> str:
+            # Per-image OCR: push the extracted image back through a fresh
+            # EtlPipelineService as if it were a standalone upload. The
+            # sub-service is built with vision_llm=None, so this call cannot
+            # come back into the picture-description path.
+            sub_service = EtlPipelineService(vision_llm=None)
+            sub_request = EtlRequest(file_path=image_path, filename=image_name)
+            try:
+                ocr_result = await sub_service.extract(sub_request)
+            except (
+                EtlUnsupportedFileError,
+                EtlServiceUnavailableError,
+            ) as exc:
+                # Expected when the configured ETL service cannot handle this
+                # image (or none is configured): report "no OCR text" quietly
+                # instead of warning for every image.
+                logging.debug(
+                    "Skipping per-image OCR for %s: %s", image_name, exc
+                )
+                return ""
+            else:
+                return ocr_result.markdown_content
+
+        try:
+            pictures = await describe_pictures(
+                request.file_path,
+                request.filename,
+                vision_llm,
+                ocr_runner=_ocr_image,
+            )
+        except Exception:
+            # Descriptions are a bonus on top of the parser output; a failure
+            # here must not turn a successful extraction into an error.
+            logging.warning(
+                "Picture description failed for %s, returning parser output unchanged",
+                request.filename,
+                exc_info=True,
+            )
+            return markdown
+
+        if pictures.descriptions:
+            enriched = merge_descriptions_into_markdown(markdown, pictures)
+            logging.info(
+                "Vision LLM described %d image(s) in %s "
+                "(skipped: %d small / %d large / %d duplicate, %d failed)",
+                len(pictures.descriptions),
+                request.filename,
+                pictures.skipped_too_small,
+                pictures.skipped_too_large,
+                pictures.skipped_duplicate,
+                pictures.failed,
+            )
+            return enriched
+
+        return markdown
+
    async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
        """Try Azure Document Intelligence first (when configured) then LlamaCloud.

--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@ -4,12 +4,34 @@ import os

 from langchain_core.messages import HumanMessage

+# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
+# A standalone image IS the document, so we want everything: visual
+# content plus any text the model can read off it. The output is
+# combined markdown that the chunker treats as the full document body.
 _PROMPT = (
    "Describe this image in markdown. "
    "Transcribe any visible text verbatim. "
    "Be concise but complete — let the image content guide the level of detail."
 )

+# Per-image-in-PDF prompt. Here the image is *inside* a larger
+# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
+# already running OCR over the whole page — including text rendered
+# into images. So we explicitly tell the model NOT to transcribe text
+# and to focus only on visual interpretation. This avoids paying
+# output tokens for OCR content the ETL pipeline already captured.
+_DESCRIPTION_PROMPT = (
+    "Describe what this image visually depicts in concise markdown. "
+    "Focus on visual content — anatomy, structures, charts, diagrams, "
+    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
+    "histology slide), and any clinically or structurally relevant "
+    "findings.\n\n"
+    "Do NOT transcribe text from the image. Any text in the image "
+    "(axis labels, annotations, scale bars, lab values, etc.) is "
+    "already extracted by a separate OCR pipeline; duplicating it "
+    "here would be redundant. Stick to the visual interpretation."
+)
+
 _MAX_IMAGE_BYTES = (
    5 * 1024 * 1024
 )  # 5 MB (Anthropic Claude's limit, the most restrictive)
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
    return f"data:{mime_type};base64,{encoded}"


-async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
-    data_url = _image_to_data_url(file_path)
+async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
    message = HumanMessage(
        content=[
-            {"type": "text", "text": _PROMPT},
+            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    if not text or not text.strip():
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Describe a standalone image upload in a single vision-LLM call.
+
+    The image at ``file_path`` is encoded as a data URL and sent with
+    ``_PROMPT``, which asks for a markdown description plus a verbatim
+    transcription of visible text. ``filename`` is only forwarded to
+    ``_invoke_vision``. Returns the model's text.
+    """
+    return await _invoke_vision(
+        llm, _PROMPT, _image_to_data_url(file_path), filename
+    )
+
+
+async def parse_image_for_description(
+    file_path: str, filename: str, llm
+) -> str:
+    """Ask the vision LLM for a visual-only description of an embedded image.
+
+    Same call path as ``parse_with_vision_llm`` but with
+    ``_DESCRIPTION_PROMPT``, which tells the model not to transcribe text
+    because the document's OCR pipeline is expected to cover it already.
+    ``picture_describer`` calls this for images pulled out of a larger
+    document. Returns the model's markdown description.
+    """
+    return await _invoke_vision(
+        llm, _DESCRIPTION_PROMPT, _image_to_data_url(file_path), filename
+    )
+
+
+__all__ = [
+    "parse_image_for_description",
+    "parse_with_vision_llm",
+]
--- a/surfsense_backend/app/etl_pipeline/picture_describer.py
+++ b/surfsense_backend/app/etl_pipeline/picture_describer.py
@ -0,0 +1,678 @@
+"""Extract embedded images from PDFs, describe them, and inject the
+descriptions inline into the parser's markdown.
+
+When the operator passes ``use_vision_llm=True`` for a PDF, the document
+parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
+but mostly drop the actual image content -- a CT scan inside a clinical
+PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
+and the caption text below it.
+
+This module fills that gap. After the document parser produces markdown
+text, we:
+
+1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
+   image (deduped by sha256, size-capped to match the vision LLM's own
+   limits).
+2. Run the vision LLM on each unique image (visual description) and,
+   in parallel when an OCR runner is provided, re-feed the same image
+   through the ETL service for per-image OCR.
+3. **Inject** a horizontal-rule-delimited markdown section -- with
+   named "OCR text" and "Visual description" sub-sections -- where the
+   image actually appears in the parser's markdown. Two splice modes,
+   chosen by which marker the parser emitted:
+
+   - **Replace** Docling-style ``<!-- image -->`` placeholders (and an
+     optional ``Image: <filename>`` caption line). The placeholder
+     carries no useful content of its own, so we substitute our block
+     for it.
+   - **Append after** layout-aware ``<figure>...</figure>`` blocks
+     (Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
+     already contain parser-extracted chart values / OCR'd labels /
+     captions, which are themselves useful for retrieval -- so we
+     PRESERVE the figure verbatim and add our vision-LLM block
+     immediately after it. The chunk then contains both the parser's
+     structured numbers AND the VLM's semantic interpretation.
+
+   Either way, the image content stays in context with the surrounding
+   document body rather than getting orphaned at the end -- crucial for
+   retrieval, where a single chunk should contain the question, the
+   image content, and the answer options together.
+
+If no placeholders, figures, or captions can be matched (e.g. an
+unusual parser output), we fall back to appending an
+``## Image Content`` section so no image content is silently lost.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import hashlib
+import logging
+import re
+import tempfile
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+# Type alias for the OCR callback. Takes (file_path, filename), returns
+# the OCR'd markdown text -- or empty string if no text was found, or
+# raises if OCR failed unrecoverably (which the describer catches and
+# treats as "no OCR for this image" rather than failing the whole doc).
+OcrRunner = Callable[[str, str], Awaitable[str]]
+
+logger = logging.getLogger(__name__)
+
+
+# Bound how many vision LLM calls we make in parallel for a single
+# document. Vision models are typically rate-limited; 4 concurrent
+# calls is a safe default that respects most provider limits while
+# keeping wall-clock manageable for image-heavy PDFs.
+# The semaphore built from this value is held while BOTH the vision call
+# and the optional per-image OCR call run, so it effectively caps the
+# number of images in flight, not just the number of LLM requests.
+_VISION_CONCURRENCY = 4
+
+# Match parse_with_vision_llm's per-image cap so we don't even attempt
+# images that the vision LLM would reject anyway (Anthropic's 5 MB
+# limit is the most restrictive among the major providers).
+# This duplicates the value of ``_MAX_IMAGE_BYTES`` in
+# ``parsers/vision_llm.py``; keep the two in sync. It is compared against
+# the length of the raw bytes returned by pypdf for each image.
+_MAX_IMAGE_BYTES = 5 * 1024 * 1024
+
+# Skip degenerate images: tracking pixels, very small decorative dots,
+# scanner-introduced artefacts. We can't cheaply check pixel dimensions
+# without decoding the image, so we approximate: anything under 1 KB is
+# almost certainly not informative content.
+_MIN_IMAGE_BYTES = 1024
+
+
+@dataclass
+class PictureDescription:
+    """One extracted image plus the content produced for it.
+
+    The two content fields come from different tools on purpose: the
+    vision LLM supplies the visual interpretation, while text-in-image
+    comes from re-feeding the image through the configured ETL service
+    as if it were a standalone upload, so no vision-LLM tokens are spent
+    on transcription.
+
+    Attributes:
+        page_number: Page the image was found on, 1-indexed.
+        ordinal_in_page: Position of the image within its page, 0-indexed.
+        name: Name pypdf assigned to the image (e.g. ``"Im0"``).
+        sha256: Hash of the raw image bytes.
+        description: Visual description from the vision LLM (markdown).
+        ocr_text: OCR text from the ETL service; ``None`` when OCR was not
+            requested or found no text.
+    """
+
+    page_number: int
+    ordinal_in_page: int
+    name: str
+    sha256: str
+    description: str
+    ocr_text: str | None = None
+
+
+@dataclass
+class PictureExtractionResult:
+    """Outcome of describing every picture in one document.
+
+    ``descriptions`` holds the images that were described successfully;
+    the counters record how many images were dropped, and why, or failed.
+    """
+
+    descriptions: list[PictureDescription] = field(default_factory=list)
+    skipped_too_small: int = 0
+    skipped_too_large: int = 0
+    skipped_duplicate: int = 0
+    failed: int = 0
+
+    @property
+    def has_content(self) -> bool:
+        """``True`` when at least one image was described."""
+        return len(self.descriptions) > 0
+
+
+def _is_pdf(filename: str) -> bool:
+    """Return ``True`` when ``filename`` ends in ``.pdf`` (case-insensitive)."""
+    lowered = filename.lower()
+    return lowered.endswith(".pdf")
+
+
+def _pick_suffix(name: str) -> str:
+    """Choose a temp-file suffix from an image name.
+
+    Recognised image extensions are returned as-is (matched
+    case-insensitively), except ``.jpg`` which is normalised to ``.jpeg``.
+    Names without a recognised extension fall back to ``.png``.
+    """
+    known = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")
+    lowered = name.lower()
+    matched = next((ext for ext in known if lowered.endswith(ext)), ".png")
+    return ".jpeg" if matched == ".jpg" else matched
+
+
+def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
+    """Pull every embedded image out of a PDF.
+
+    Returns a list of ``(page_number_1_indexed, ordinal_in_page, name, bytes)``
+    tuples in page order. ``name`` is the name pypdf reports for the image,
+    or ``page<N>_img<M>`` when it reports none.
+
+    Failures never propagate: if the file cannot be opened or its page list
+    cannot be resolved, an empty list is returned; a page or image that
+    cannot be read is logged and skipped -- one bad image must not fail the
+    whole document.
+    """
+
+    from pypdf import PdfReader
+
+    out: list[tuple[int, int, str, bytes]] = []
+    try:
+        reader = PdfReader(file_path)
+        # pypdf resolves the page list lazily, so a damaged or still-encrypted
+        # page tree may only raise here rather than in the constructor. Doing
+        # it inside this try keeps such files on the "log and return nothing"
+        # path instead of raising out of the function.
+        page_count = len(reader.pages)
+    except Exception:
+        logger.warning(
+            "pypdf failed to open %s for image extraction",
+            file_path,
+            exc_info=True,
+        )
+        return out
+
+    for page_idx in range(page_count):
+        try:
+            # Fetch the page by index inside the try so that a page that
+            # cannot be loaded is skipped like any other per-page failure.
+            images = list(reader.pages[page_idx].images)
+        except Exception:
+            logger.warning(
+                "pypdf failed to enumerate images on page %d of %s",
+                page_idx + 1,
+                file_path,
+                exc_info=True,
+            )
+            continue
+        for img_idx, img in enumerate(images):
+            try:
+                name = getattr(img, "name", None) or f"page{page_idx + 1}_img{img_idx}"
+                data = img.data
+            except Exception:
+                logger.warning(
+                    "pypdf failed to read image %d on page %d of %s",
+                    img_idx,
+                    page_idx + 1,
+                    file_path,
+                    exc_info=True,
+                )
+                continue
+            out.append((page_idx + 1, img_idx, name, data))
+    return out
+
+
+async def _describe_one(
+    page_number: int,
+    ordinal: int,
+    name: str,
+    sha256: str,
+    data: bytes,
+    vision_llm: Any,
+    semaphore: asyncio.Semaphore,
+    ocr_runner: OcrRunner | None,
+) -> PictureDescription | None:
+    """Describe (and optionally OCR) a single extracted image.
+
+    The image bytes are written to a temporary file whose path is handed
+    to ``parse_image_for_description`` and, when ``ocr_runner`` is given,
+    to the runner as well; both run concurrently while ``semaphore`` is
+    held.
+
+    Returns:
+        A :class:`PictureDescription`, or ``None`` when the vision LLM call
+        raised. An OCR failure or empty OCR output only leaves ``ocr_text``
+        as ``None``.
+
+    The temporary file is removed on every exit path, including a failed
+    write of the image bytes.
+    """
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    suffix = _pick_suffix(name)
+    tmp_path: str | None = None
+    try:
+        # NamedTemporaryFile + delete=False because the vision-LLM helper
+        # and the OCR runner each open the path themselves; we clean up in
+        # the finally. Same temp file feeds both: vision LLM and OCR are
+        # looking at the same image, just asking different questions of it.
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            # Record the path BEFORE writing so that a failed write (e.g.
+            # disk full) still lets the finally block remove the file.
+            tmp_path = tmp.name
+            tmp.write(data)
+
+        async with semaphore:
+            tasks: list[Awaitable[Any]] = [
+                parse_image_for_description(tmp_path, name, vision_llm),
+            ]
+            if ocr_runner is not None:
+                tasks.append(ocr_runner(tmp_path, name))
+
+            # return_exceptions=True so a failure in one branch (most
+            # often OCR) doesn't poison the other.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        description_result = results[0]
+        if isinstance(description_result, BaseException):
+            logger.warning(
+                "Vision LLM failed for image %s on page %d, skipping",
+                name,
+                page_number,
+                exc_info=description_result,
+            )
+            return None
+        description = str(description_result)
+
+        ocr_text: str | None = None
+        if ocr_runner is not None and len(results) > 1:
+            ocr_result = results[1]
+            if isinstance(ocr_result, BaseException):
+                logger.warning(
+                    "Per-image OCR failed for image %s on page %d, "
+                    "omitting OCR field for this image",
+                    name,
+                    page_number,
+                    exc_info=ocr_result,
+                )
+            else:
+                stripped = str(ocr_result).strip()
+                # Empty OCR (or whitespace-only) means the OCR engine
+                # found no text in this image. Record that as None so
+                # the rendered block doesn't include a useless empty tag.
+                ocr_text = stripped or None
+    finally:
+        if tmp_path is not None:
+            with contextlib.suppress(OSError):
+                Path(tmp_path).unlink()
+
+    return PictureDescription(
+        page_number=page_number,
+        ordinal_in_page=ordinal,
+        name=name,
+        sha256=sha256,
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+async def describe_pictures(
+    file_path: str,
+    filename: str,
+    vision_llm: Any,
+    *,
+    ocr_runner: OcrRunner | None = None,
+) -> PictureExtractionResult:
+    """Extract embedded images from a document and describe each via vision LLM.
+
+    Images larger than ``_MAX_IMAGE_BYTES``, smaller than
+    ``_MIN_IMAGE_BYTES``, or byte-identical (same sha256) to an earlier
+    image are skipped and counted in the matching ``skipped_*`` field.
+
+    When ``ocr_runner`` is provided, each image is also passed to it
+    (in parallel with the vision LLM) and the returned text is recorded
+    in :attr:`PictureDescription.ocr_text`.
+
+    An image whose processing fails -- the vision call failing, or any
+    other exception while handling that image -- is counted in ``failed``
+    and does not affect the remaining images.
+
+    Currently PDF-only (decided from ``filename``). For non-PDF documents,
+    or when ``vision_llm`` is ``None``, this returns an empty result and
+    the caller should leave the parser's markdown untouched.
+    """
+
+    result = PictureExtractionResult()
+    if not _is_pdf(filename) or vision_llm is None:
+        return result
+
+    raw_images = _extract_pdf_images(file_path)
+    if not raw_images:
+        return result
+
+    seen_hashes: set[str] = set()
+    eligible: list[tuple[int, int, str, str, bytes]] = []
+    for page_number, ordinal, name, data in raw_images:
+        if len(data) > _MAX_IMAGE_BYTES:
+            result.skipped_too_large += 1
+            continue
+        if len(data) < _MIN_IMAGE_BYTES:
+            result.skipped_too_small += 1
+            continue
+        sha = hashlib.sha256(data).hexdigest()
+        if sha in seen_hashes:
+            result.skipped_duplicate += 1
+            continue
+        seen_hashes.add(sha)
+        eligible.append((page_number, ordinal, name, sha, data))
+
+    if not eligible:
+        return result
+
+    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
+    tasks = [
+        _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
+        for (p, o, n, sha, d) in eligible
+    ]
+    # return_exceptions=True: without it, one image raising (for example an
+    # OSError while writing its temp file) would propagate out of gather and
+    # throw away every description that did succeed.
+    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
+    for (page_number, _ordinal, name, _sha, _data), outcome in zip(
+        eligible, outcomes
+    ):
+        if isinstance(outcome, Exception):
+            logger.warning(
+                "Image %s on page %d could not be processed, skipping",
+                name,
+                page_number,
+                exc_info=outcome,
+            )
+            result.failed += 1
+        elif isinstance(outcome, BaseException):
+            # Cancellation and other non-Exception signals are not per-image
+            # failures; keep propagating them as before.
+            raise outcome
+        elif outcome is None:
+            result.failed += 1
+        else:
+            result.descriptions.append(outcome)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Rendering: build the per-image markdown block + inject inline.
+# ---------------------------------------------------------------------------
+
+
+def _format_image_block(
+    name: str,
+    description: str,
+    ocr_text: str | None = None,
+) -> str:
+    """Build the markdown section for one image, fenced by horizontal rules.
+
+    Only ordinary top-level markdown is emitted (bold labels followed by
+    free-form text). Earlier attempts did not survive the document viewer:
+
+    - Raw HTML / XML wrappers such as ``<image>...</image>`` have no render
+      rules in Streamdown or PlateJS, so their content stayed in the
+      source but was invisible to readers.
+    - Blockquotes break as soon as the description contains a block
+      element (list, table, fenced code): Streamdown / remark evicts it
+      from the quote and everything after lands at document level.
+
+    Layout (the OCR part is left out when ``ocr_text`` is None or blank):
+
+        ---
+
+        **Embedded image:** `MM-130-a.jpeg`
+
+        **OCR text:**
+        Slice 24 / 60
+        L
+        R
+
+        **Visual description:**
+
+        - Axial contrast-enhanced CT showing a large cystic mass...
+
+        ---
+
+    The ``**Embedded image:** `<filename>``` line is easy to locate with a
+    regex (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).
+
+    The returned string starts and ends with a blank line and a rule, so
+    it can be spliced into the middle of a markdown stream without merging
+    into neighbouring paragraphs.
+    """
+
+    lines: list[str] = [f"**Embedded image:** `{name}`"]
+
+    if ocr_text and ocr_text.strip():
+        # Drop blank OCR rows and trailing whitespace. The label and every
+        # row except the last end with two spaces (a markdown hard break) so
+        # the OCR renders line by line without a fenced code block.
+        ocr_rows = [
+            row.rstrip() for row in ocr_text.strip().splitlines() if row.strip()
+        ]
+        lines += ["", "**OCR text:**  "]
+        lines += [f"{row}  " for row in ocr_rows[:-1]]
+        lines += ocr_rows[-1:]
+
+    lines += ["", "**Visual description:**", "", description.strip()]
+
+    rule = "\n\n---\n\n"
+    return rule + "\n".join(lines) + rule
+
+
+# Patterns we'll try to splice into. Each pattern captures the
+# original-PDF filename when one is available (group 1).
+#
+# Replace-style markers (the matched span is substituted with our block
+# because it carries no useful content of its own):
+#
+# 1. Docling's image placeholder followed by an "Image: <filename>"
+#    caption line. This is what our medxpertqa renderer produces:
+#    reportlab places the JPEG, then a caption, and Docling outputs
+#    the placeholder + caption.
+# 2. Docling's image placeholder alone (filename unknown -- we fall
+#    back to pypdf's name).
+# 3. A bare "Image: <filename>" caption line with no preceding
+#    placeholder. Rare in practice, but covers parsers that drop the
+#    placeholder entirely.
+# Placeholder followed by a caption line. Group 1 is the first run of
+# non-whitespace after "Image:" (the filename). The ``\s*`` between the two
+# also spans blank lines, and the match consumes the caption's newline.
+_PLACEHOLDER_WITH_CAPTION = re.compile(
+    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
+    re.IGNORECASE,
+)
+# Bare placeholder; defines no capture group. It also matches the placeholder
+# part of a captioned marker, so presumably it has to be tried after
+# _PLACEHOLDER_WITH_CAPTION -- verify against the caller.
+_PLACEHOLDER_ONLY = re.compile(
+    r"<!--\s*image\s*-->",
+    re.IGNORECASE,
+)
+# Caption on a line of its own (optionally indented with spaces or tabs).
+# Group 1 is the filename token, as in _PLACEHOLDER_WITH_CAPTION.
+_CAPTION_ONLY = re.compile(
+    r"^[ \t]*Image:\s*(\S+)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Append-after marker (the matched span is preserved verbatim and our
+# block is inserted immediately after it):
+#
+# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
+#    Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
+#    The figure's own contents -- chart bar values, axis labels,
+#    inline ``<figcaption>``, embedded ``<table>`` for tabular figures
+#    -- are themselves specialist OCR output, so we keep them and add
+#    our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
+#    optional attributes like ``<figure id="...">``; ``re.DOTALL``
+#    lets ``.`` cross the newlines inside the block.
+# Non-greedy body: the match ends at the FIRST ``</figure>`` after the
+# opening tag, so a nested ``<figure>`` would be cut at the inner closing
+# tag. No capture group is defined.
+_FIGURE_BLOCK = re.compile(
+    r"<figure\b[^>]*>.*?</figure>",
+    re.DOTALL | re.IGNORECASE,
+)
+
+
+def _replace_one_match(
+    markdown: str,
+    pattern: re.Pattern[str],
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Substitute the first ``pattern`` match with the block for ``descriptions[desc_idx]``.
+
+    When the pattern defines a capture group and group 1 matched something,
+    that text is used as the image name in the block; otherwise the
+    description's own ``name`` is used.
+
+    Returns ``(markdown, desc_idx)`` untouched when every description has
+    already been consumed or the pattern does not match; otherwise the
+    rewritten markdown and ``desc_idx + 1``.
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    found = pattern.search(markdown)
+    if found is None:
+        return markdown, desc_idx
+
+    current = descriptions[desc_idx]
+    caption_name = found.group(1) if found.groups() else None
+    block = _format_image_block(
+        caption_name or current.name, current.description, current.ocr_text
+    )
+
+    head = markdown[: found.start()]
+    tail = markdown[found.end():]
+    return head + block + tail, desc_idx + 1
+
+
+def _splice_after_figures(
+    markdown: str,
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Insert an image block right after each ``<figure>...</figure>`` span.
+
+    The figure markup itself is left in place -- it carries the parser's
+    own extraction of the figure's text -- and the vision-LLM block is
+    added directly behind it, so a chunk can hold both.
+
+    Figures and descriptions are paired in document order, starting at
+    ``descriptions[desc_idx]``; pairing stops when either runs out. Every
+    match offset refers to the ORIGINAL string, so insertions are applied
+    from the last figure backwards to keep the earlier offsets valid.
+
+    Returns the new markdown and the index of the first description that
+    was not used.
+    """
+
+    remaining = len(descriptions) - desc_idx
+    if remaining <= 0:
+        return markdown, desc_idx
+
+    figures = list(_FIGURE_BLOCK.finditer(markdown))
+    if not figures:
+        return markdown, desc_idx
+
+    count = min(len(figures), remaining)
+    if count <= 0:
+        return markdown, desc_idx
+
+    spliced = markdown
+    for offset in reversed(range(count)):
+        insert_at = figures[offset].end()
+        picture = descriptions[desc_idx + offset]
+        block = _format_image_block(
+            picture.name, picture.description, picture.ocr_text
+        )
+        spliced = spliced[:insert_at] + block + spliced[insert_at:]
+
+    return spliced, desc_idx + count
+
+
+def inject_descriptions_inline(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> tuple[str, int]:
+    """Place image descriptions at the image positions found in ``markdown``.
+
+    Descriptions are consumed strictly in list order by two passes:
+
+    1. **Append-after** -- every ``<figure>...</figure>`` block (layout-aware
+       parsers such as Azure DI ``prebuilt-layout`` or LlamaCloud premium)
+       keeps its own content and gets the next description's block
+       inserted right behind it.
+    2. **Replace** -- Docling-style markers are replaced one at a time.
+       Each round substitutes the first occurrence of the highest-priority
+       pattern that still matches:
+
+       - ``<!-- image -->`` followed by an ``Image: <filename>`` caption,
+       - a lone ``<!-- image -->`` placeholder,
+       - a bare ``Image: <filename>`` caption.
+
+    Documents normally contain only one marker style; if both occur, the
+    figures take descriptions first.
+
+    Returns:
+        ``(new_markdown, n_inlined)`` where ``n_inlined`` is how many
+        descriptions were placed. Handling ``descriptions[n_inlined:]``
+        (typically appending them) is left to the caller.
+    """
+
+    descriptions = result.descriptions
+    if not descriptions:
+        return markdown, 0
+
+    # Pass 1: all <figure> blocks in one batch.
+    out, desc_idx = _splice_after_figures(markdown, descriptions, 0)
+
+    # Pass 2: one Docling-style marker per round, highest priority first.
+    marker_patterns = (_PLACEHOLDER_WITH_CAPTION, _PLACEHOLDER_ONLY, _CAPTION_ONLY)
+    while desc_idx < len(descriptions):
+        for pattern in marker_patterns:
+            out, advanced_idx = _replace_one_match(
+                out, pattern, descriptions, desc_idx
+            )
+            if advanced_idx > desc_idx:
+                desc_idx = advanced_idx
+                break
+        else:
+            # No pattern matched: no more positions to splice into.
+            break
+
+    return out, desc_idx
+
+
+def render_appended_section(
+    descriptions: list[PictureDescription],
+    *,
+    skip_notes: PictureExtractionResult | None = None,
+    heading: str = "## Image Content (vision-LLM extracted)",
+) -> str:
+    """Build the fallback section for descriptions that were not inlined.
+
+    Args:
+        descriptions: Descriptions to render, one image block each.
+        skip_notes: Optional extraction result. Its non-zero skip/failure
+            counters are summarised in a trailing ``_Note: ..._`` line.
+        heading: Markdown heading placed at the top of the section.
+
+    Returns:
+        ``""`` when there are no descriptions and ``skip_notes`` is falsy.
+        Otherwise newline-joined text: a blank line, the heading, a blank
+        line, each block followed by a blank line, then the optional note.
+    """
+
+    if not (descriptions or skip_notes):
+        return ""
+
+    lines: list[str] = ["", heading, ""]
+    for desc in descriptions:
+        lines += [
+            _format_image_block(desc.name, desc.description, desc.ocr_text),
+            "",
+        ]
+
+    if skip_notes is not None:
+        # (counter, rendered reason) pairs in reporting order; entries
+        # whose counter is zero are left out of the note.
+        tallies = (
+            (
+                skip_notes.skipped_too_large,
+                f"{skip_notes.skipped_too_large} too large (> 5 MB)",
+            ),
+            (
+                skip_notes.skipped_too_small,
+                f"{skip_notes.skipped_too_small} too small (< 1 KB)",
+            ),
+            (
+                skip_notes.skipped_duplicate,
+                f"{skip_notes.skipped_duplicate} duplicate",
+            ),
+            (skip_notes.failed, f"{skip_notes.failed} failed"),
+        )
+        notes = [reason for counter, reason in tallies if counter]
+        if notes:
+            lines.append(f"_Note: {', '.join(notes)} image(s) skipped._")
+
+    return "\n".join(lines)
+
+
+def merge_descriptions_into_markdown(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> str:
+    """Inline every description that has a marker; append the remainder.
+
+    Entry point used by the ETL pipeline. Descriptions that
+    :func:`inject_descriptions_inline` could not place are rendered as a
+    trailing section, so no described image is dropped. The section
+    heading tells the reader whether nothing at all could be
+    location-matched or only the surplus.
+
+    Returns:
+        ``markdown`` unchanged when there are no descriptions; the inlined
+        markdown when everything was placed; otherwise the inlined
+        markdown (right-stripped), a blank line, the appended section and
+        a final newline.
+    """
+
+    descriptions = result.descriptions
+    if not descriptions:
+        return markdown
+
+    inlined_markdown, placed = inject_descriptions_inline(markdown, result)
+    unplaced = descriptions[placed:]
+    if not unplaced:
+        return inlined_markdown
+
+    # Zero placed means the parser emitted no markers at all; otherwise
+    # there were simply more images than markers.
+    if placed == 0:
+        heading = "## Image Content (vision-LLM extracted)"
+    else:
+        heading = "## Image Content (additional, no inline marker found)"
+
+    appendix = render_appended_section(unplaced, heading=heading)
+    if not appendix:
+        return inlined_markdown
+    return inlined_markdown.rstrip() + "\n\n" + appendix.lstrip() + "\n"
+
+
+__all__ = [
+    "PictureDescription",
+    "PictureExtractionResult",
+    "describe_pictures",
+    "inject_descriptions_inline",
+    "merge_descriptions_into_markdown",
+    "render_appended_section",
+]
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@ -77,10 +77,16 @@ class DoclingService:
            # Create pipeline options with version-safe attribute checking
            pipeline_options = PdfPipelineOptions()

-            # Disable OCR (user request)
+            # Enable OCR so text-in-image (chart axes, ECG annotations,
+            # lab tables embedded as images, scanned pages, etc.) is
+            # lifted into the main markdown stream. This pairs with the
+            # vision-LLM picture-description pass downstream — OCR
+            # captures literal text; vision LLM captures the visual
+            # content. Together they give a faithful representation of
+            # PDFs that mix text and images.
            if hasattr(pipeline_options, "do_ocr"):
-                pipeline_options.do_ocr = False
-                logger.info("⚠️ OCR disabled by user request")
+                pipeline_options.do_ocr = True
+                logger.info("✅ OCR enabled for embedded text-in-image extraction")
            else:
                logger.warning("⚠️ OCR attribute not available in this Docling version")

--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
    """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    from app.etl_pipeline.file_classifier import (
-        FileCategory,
-        classify_file as etl_classify,
-    )

    await _notify(ctx, "parsing", "Processing file")
    await ctx.task_logger.log_task_progress(
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
        {"processing_stage": "extracting"},
    )

+    # Fetch the vision LLM whenever the operator opts in. The ETL
+    # pipeline decides what to do with it: image files run through the
+    # vision LLM directly; document files (PDFs) get per-image
+    # descriptions appended via picture_describer.
    vision_llm = None
-    if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
+    if ctx.use_vision_llm:
        from app.services.llm_service import get_vision_llm

        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:

    await _notify(ctx, "parsing", "Extracting content")

-    etl_result = await EtlPipelineService().extract(
+    # Document files (PDF, docx, etc.) get vision LLM treatment too:
+    # the ETL pipeline appends a per-image description section when
+    # vision_llm is provided. See picture_describer.describe_pictures.
+    vision_llm = None
+    if ctx.use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
+
+    etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(
            file_path=ctx.file_path,
            filename=ctx.filename,
@ -418,8 +427,12 @@ async def _extract_file_content(
        billable_pages = estimated_pages * mode.page_multiplier
        await page_limit_service.check_page_limit(user_id, billable_pages)

+    # Vision LLM is provided to the ETL pipeline for any file category
+    # when the operator opts in. Image files run through it directly;
+    # document files (PDFs) get per-image descriptions appended via
+    # picture_describer.
    vision_llm = None
-    if use_vision_llm and category == FileCategory.IMAGE:
+    if use_vision_llm:
        from app.services.llm_service import get_vision_llm

        vision_llm = await get_vision_llm(session, search_space_id)
--- a/surfsense_backend/scripts/docker/entrypoint.e2e.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.e2e.sh
@ -0,0 +1,53 @@
+#!/bin/bash
+# =============================================================================
+# E2E entrypoint for the multi-stage Dockerfile's `e2e` target.
+#
+# Dispatches on SERVICE_ROLE to the test-only entrypoints under tests/e2e/.
+# Those scripts apply sys.modules hijacks and LLM/embedding patches BEFORE
+# importing production app code (see tests/e2e/run_backend.py for rationale).
+#
+# Production never sees this file: tests/ is excluded from the production
+# stage, and the production stage uses scripts/docker/entrypoint.sh.
+# =============================================================================
+set -euo pipefail
+
+SERVICE_ROLE="${SERVICE_ROLE:-api}"
+echo "[e2e-entrypoint] starting role=${SERVICE_ROLE}"
+
+# Open one real connection through the app's engine and run a trivial
+# query. Exits non-zero when the app cannot be imported or the database
+# refuses the connection. (Merely importing the engine and disposing it
+# never touches the network, so it cannot serve as a readiness probe.)
+db_ping() {
+    python - <<'PY'
+import asyncio
+
+from sqlalchemy import text
+
+from app.db import engine
+
+
+async def ping() -> None:
+    try:
+        async with engine.connect() as conn:
+            await conn.execute(text("SELECT 1"))
+    finally:
+        await engine.dispose()
+
+
+asyncio.run(ping())
+PY
+}
+
+wait_for_db() {
+    # Block until the database accepts a connection. We don't loop forever —
+    # Compose depends_on/healthchecks already gate on db readiness, this is
+    # just belt-and-suspenders so a slow first connection doesn't race
+    # migrations. Returns 0 on the first successful ping, 1 after 60 failed
+    # attempts (which aborts the script under `set -e`).
+    local attempt
+    for attempt in {1..60}; do
+        echo "[e2e-entrypoint] db check attempt ${attempt}/60"
+        if db_ping; then
+            echo "[e2e-entrypoint] db reachable after ${attempt} attempts"
+            return 0
+        fi
+        sleep 1
+    done
+    echo "[e2e-entrypoint] ERROR: db not reachable after 60 attempts" >&2
+    return 1
+}
+
+# Role launchers. Each one ends in `exec` so SIGTERM from `docker stop`
+# reaches Python directly, without a shell wrapper interposing.
+start_api() {
+    wait_for_db
+    echo "[e2e-entrypoint] running alembic upgrade head"
+    alembic upgrade head
+    exec python tests/e2e/run_backend.py
+}
+
+start_worker() {
+    # Worker doesn't run migrations — the api role does that exactly once.
+    # We still wait for db so Celery's broker connection check doesn't
+    # race against an unready Postgres on cold start.
+    wait_for_db
+    exec python tests/e2e/run_celery.py
+}
+
+case "${SERVICE_ROLE}" in
+    api)
+        start_api
+        ;;
+    worker)
+        start_worker
+        ;;
+    *)
+        echo "[e2e-entrypoint] ERROR: unknown SERVICE_ROLE='${SERVICE_ROLE}' (expected: api | worker)" >&2
+        exit 1
+        ;;
+esac
--- a/surfsense_backend/tests/e2e/README.md
+++ b/surfsense_backend/tests/e2e/README.md
@ -1,48 +1,48 @@
-# Backend E2E Test Harness
+# Backend E2E Harness

-Strict fakes + alternative entrypoints used **only** by Playwright E2E.
-Excluded from the production Docker image via `.dockerignore`.
+This directory contains the test-only backend entrypoints and fakes used by
+Playwright. They are not part of the production image: `.dockerignore` excludes
+`tests/`, and the E2E Docker stage copies this directory through a separate
+build context.

 ## Files

-| Path                             | Role                                                                            |
-| -------------------------------- | ------------------------------------------------------------------------------- |
-| `run_backend.py`                 | FastAPI entrypoint that hijacks `sys.modules` before importing `app.app:app`    |
-| `run_celery.py`                  | Celery worker entrypoint with the same hijack + patch logic                     |
-| `middleware/scenario.py`         | `X-E2E-Scenario` header → ContextVar (read by fakes)                            |
-| `fakes/composio_module.py`       | Strict drop-in for the `composio` package; raises on unknown surface            |
-| `fakes/llm.py`                   | `fake_get_user_long_context_llm` returning a `FakeListChatModel`                |
-| `fakes/embeddings.py`            | Deterministic 0.1-vector `embed_text` / `embed_texts`                           |
-| `fakes/fixtures/drive_files.json`| Canned Drive listings + file contents (incl. canary tokens)                     |
+| Path | Purpose |
+| --- | --- |
+| `run_backend.py` | Starts FastAPI after installing the test fakes into `sys.modules`. |
+| `run_celery.py` | Starts the Celery worker with the same fake setup. |
+| `middleware/scenario.py` | Reads `X-E2E-Scenario` into a request-scoped context var. |
+| `fakes/composio_module.py` | Fake `composio` package used by connector flows. |
+| `fakes/llm.py` | Fake chat model factory. |
+| `fakes/embeddings.py` | Deterministic embedding helpers. |
+| `fakes/fixtures/drive_files.json` | Drive fixture data and canary file contents. |

-## Why a sys.modules hijack?
+## Why the import hook exists

-Production code does `from composio import Composio` at module load
-time. By the time the FastAPI app object exists, that binding has
-already been resolved. The hijack runs **before** any `app.*` import,
-so the binding resolves to our strict fake. No production source
-changes; fakes are physically excluded from production images.
+Some production modules import SDK clients at module load time, for example
+`from composio import Composio`. By the time `app.app` has been imported, those
+bindings are already fixed.

-Belt + suspenders + no internet: the strict `__getattr__` in every
-fake raises `NotImplementedError` if a future production code path
-introduces a new SDK call. CI also sets `HTTPS_PROXY=http://127.0.0.1:1`
-plus sentinel API keys so any leaked outbound HTTP fails immediately.
+The E2E entrypoints install fake modules in `sys.modules` before importing any
+`app.*` module. That lets the normal production code run while SDK calls resolve
+to local fakes.

-## Adding a new fake
+The fakes should fail loudly. If production starts using a new SDK method that
+the fake does not implement, add that method to the fake instead of letting the
+test call the real service.

-1. Create `fakes/<sdk>_module.py` modelled on `composio_module.py`.
-2. In `run_backend.py` and `run_celery.py`, register
-   `sys.modules["<sdk>"] = _fake_<sdk>` before the `from app.app import app`
-   line.
-3. If the new fake needs scenario branching, read from
+## Adding a fake
+
+1. Add `fakes/<sdk>_module.py`.
+2. Register it in both `run_backend.py` and `run_celery.py` before importing
+   `app.app` or `app.celery_app`.
+3. If the fake needs per-test behavior, read the current scenario from
   `tests.e2e.middleware.scenario.current_scenario()`.

-## Reused by backend integration tests
+## Shared with backend integration tests

-The strict fakes are not only for Playwright. Backend route integration
-tests can import the same fake before importing `app.app`, so Composio
-route tests exercise production route code without touching the real
-SDK:
+Backend integration tests can use the same fakes when they need production route
+code without the real SDK:

 ```python
 from tests.e2e.fakes import composio_module as _fake_composio
@ -50,20 +50,93 @@ sys.modules["composio"] = _fake_composio
 from app.app import app
 ```

-See `surfsense_backend/tests/integration/composio/conftest.py` for the
-current pattern.
+See `surfsense_backend/tests/integration/composio/conftest.py` for the current
+pattern.

 ## Running locally

+The recommended local flow runs only Postgres and Redis in Docker, and the
+backend + Celery worker on the host. No `.env` file is required: both
+entrypoints `setdefault` every variable they need (DB URL, Redis URL,
+sentinel API keys, etc.) to values that match `docker-compose.deps-only.yml`.
+
+### One-time setup
+
+From `surfsense_web/`:
+
 ```bash
-cd surfsense_backend
+pnpm install
+pnpm exec playwright install --with-deps chromium
+```
+
+### Each run
+
+**1. Bring up Postgres + Redis** from the repo root. The other deps-only
+services (SearXNG, Zero, pgAdmin) are not needed for E2E:
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml up -d db redis
+```
+
+**2. Start the backend** in `surfsense_backend/`, terminal A:
+
+```bash
+uv sync
+uv run alembic upgrade head
 uv run python tests/e2e/run_backend.py
-# in a second shell:
+```
+
+**3. Start the Celery worker** in `surfsense_backend/`, terminal B:
+
+```bash
 uv run python tests/e2e/run_celery.py
 ```

-Then in `surfsense_web`:
+**4. Register the Playwright user**:

 ```bash
-pnpm test:e2e
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"email":"e2e-test@surfsense.net","password":"E2eTestPassword123!"}'
 ```
+
+**5. Run Playwright** from `surfsense_web/`, terminal C:
+
+```bash
+pnpm test:e2e             # dev server (fast iteration)
+pnpm test:e2e:headed      # show the browser
+pnpm test:e2e:ui          # Playwright UI mode
+pnpm test:e2e:prod        # build + start (matches CI exactly)
+```
+
+`playwright.config.ts` and the run scripts share defaults, so this works on a
+fresh checkout. Set `PLAYWRIGHT_TEST_EMAIL`, `PLAYWRIGHT_TEST_PASSWORD`,
+`NEXT_PUBLIC_FASTAPI_BACKEND_URL`, or any backend env (e.g. `DATABASE_URL`)
+only when pointing tests at a different stack.
+
+### Cleanup
+
+```bash
+docker compose -f docker/docker-compose.deps-only.yml down
+```
+
+Add `-v` to also wipe the Postgres volume.
+
+### Hermetic alternative (matches CI)
+
+To reproduce the CI environment exactly — backend and Celery in containers,
+network egress denied at L3 — replace steps 1–3 with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
+```
+
+Then run steps 4 (curl register) and 5 (`pnpm test:e2e:prod`) as above. Tear
+down with:
+
+```bash
+docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
+```
+
+This builds the ~9 GB `surfsense-e2e-backend:local` image, so the deps-only
+flow above is faster for day-to-day development.
--- a/surfsense_backend/tests/e2e/auth_mint.py
+++ b/surfsense_backend/tests/e2e/auth_mint.py
@ -0,0 +1,66 @@
+"""Test-only token mint endpoint for the E2E backend entrypoint.
+
+Mounted by ``tests/e2e/run_backend.py`` so Playwright can authenticate
+the seeded e2e user without hitting ``/auth/jwt/login`` (rate-limited
+to 5/min/IP in production). NEVER ships to production: this whole
+``tests/`` tree is excluded from the production Docker image by
+``surfsense_backend/.dockerignore``.
+
+Authn: shared secret in ``X-E2E-Mint-Secret``. Same value is set on the
+backend container env (``docker/docker-compose.e2e.yml``) and exported
+to the Playwright runner (``.github/workflows/e2e-tests.yml``).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+from fastapi import APIRouter, FastAPI, Header, HTTPException
+from pydantic import BaseModel
+from sqlalchemy import select
+
+from app.db import User, async_session_maker
+from app.users import get_jwt_strategy
+
+_logger = logging.getLogger("surfsense.e2e.auth_mint")
+
+
+class MintRequest(BaseModel):
+    """Request body for the token mint endpoint.
+
+    ``email`` selects the user to mint a token for; it defaults to the
+    e2e test account, so an empty JSON object is a valid request.
+    """
+
+    email: str = "e2e-test@surfsense.net"
+
+
+class MintResponse(BaseModel):
+    """Response body of the token mint endpoint.
+
+    ``access_token`` is the token written by the JWT strategy;
+    ``token_type`` is always ``"bearer"`` unless overridden by the caller
+    constructing the model.
+    """
+
+    access_token: str
+    token_type: str = "bearer"
+
+
+def _expected_secret() -> str:
+    """Return the secret callers must present in ``X-E2E-Mint-Secret``.
+
+    Looked up from ``E2E_MINT_SECRET`` on every call (not cached at import
+    time); a fixed local-only default is used when the variable is unset.
+    """
+    return os.getenv("E2E_MINT_SECRET", "local-e2e-mint-secret-not-for-production")
+
+
+router = APIRouter(prefix="/__e2e__", tags=["__e2e__"])
+
+
+@router.post("/auth/token", response_model=MintResponse)
+async def mint_test_token(
+    body: MintRequest,
+    x_e2e_mint_secret: str = Header(..., alias="X-E2E-Mint-Secret"),
+) -> MintResponse:
+    """Mint a JWT for an already-registered user, bypassing the login route.
+
+    Args:
+        body: Carries the email of the user to authenticate as.
+        x_e2e_mint_secret: Value of the required ``X-E2E-Mint-Secret``
+            header; must equal :func:`_expected_secret`.
+
+    Returns:
+        A :class:`MintResponse` holding the freshly written token.
+
+    Raises:
+        HTTPException: 403 when the secret does not match, 404 when no
+            user with ``body.email`` exists.
+    """
+    # Function-scope import so the module's import block stays untouched.
+    import hmac
+
+    # Compare in constant time so response latency does not reveal how
+    # much of a guessed secret was correct. Both sides are encoded to
+    # bytes because ``compare_digest`` rejects non-ASCII ``str`` input
+    # with TypeError, which would surface as a 500 instead of a 403.
+    supplied = x_e2e_mint_secret.encode("utf-8")
+    expected = _expected_secret().encode("utf-8")
+    if not hmac.compare_digest(supplied, expected):
+        raise HTTPException(status_code=403, detail="invalid e2e mint secret")
+    async with async_session_maker() as session:
+        result = await session.execute(select(User).where(User.email == body.email))
+        user = result.scalar_one_or_none()
+    if user is None:
+        raise HTTPException(
+            status_code=404, detail=f"e2e user {body.email!r} not seeded"
+        )
+    token = await get_jwt_strategy().write_token(user)
+    return MintResponse(access_token=token)
+
+
+def install(app: FastAPI) -> None:
+    """Mount the test-only mint router onto the given FastAPI app.
+
+    Registers ``POST /__e2e__/auth/token`` on ``app`` and then logs the
+    fact at WARNING level so the extra route is visible in startup logs.
+    """
+    app.include_router(router)
+    _logger.warning("[e2e] mounted POST /__e2e__/auth/token (test-only token mint)")
--- a/surfsense_backend/tests/e2e/fakes/docling_service.py
+++ b/surfsense_backend/tests/e2e/fakes/docling_service.py
@ -0,0 +1,141 @@
+"""Stub DoclingService.process_document for E2E.
+
+The real ``DoclingService.process_document`` calls
+``DocumentConverter.convert(file_path)`` which lazily downloads the
+``docling-project/docling-layout-heron`` model from Hugging Face Hub.
+The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
+``docker/docker-compose.e2e.yml``), so that download fails with
+``LocalEntryNotFoundError`` and the indexing Celery task retries until
+the Playwright test hits its ~4-minute step timeout. In CI that is the
+difference between the suite finishing and the 30-minute job timeout
+killing the run before any report can upload.
+
+Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
+entirely. ``DoclingService.__init__`` is intentionally left untouched
+because constructing ``DocumentConverter(...)`` is cheap and offline —
+it is only ``.convert()`` that triggers the offline-model download.
+
+Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
+by ``generate_canary_pdfs.py`` and embeds its canary token as plain
+``(text) Tj`` PDF text operators. Extracting those operators gives us
+the canary string back, which is what the Playwright assertions look
+for in the resulting Document row.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Matches the `(escaped text) Tj` text-show operator emitted by
+# generate_canary_pdfs.py. Inside the parens, the escape rules are:
+#   \\  -> backslash
+#   \(  -> literal (
+#   \)  -> literal )
+# The character class [^\\()] consumes any non-escape byte; \\. consumes
+# an escape sequence. Sufficient for our synthetic fixtures.
+_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
+
+
+def _extract_text_from_synthetic_pdf(file_path: str) -> str:
+    """Collect the payload of every ``(text) Tj`` operator in a fixture PDF.
+
+    Payloads are returned in file order, one per line. Only the three
+    escapes the fixture generator emits (``\\\\``, ``\\(``, ``\\)``) are
+    undone; arbitrary PDFs are out of scope because this fake only ever
+    sees fixtures we generate ourselves.
+
+    Returns:
+        The newline-joined payloads, or ``""`` when the file cannot be
+        read (a warning is logged) or contains no such operators.
+    """
+
+    def _decode(payload: bytes) -> str:
+        # Order matters: park every escaped backslash behind a NUL
+        # sentinel first so the paren passes cannot misread it, then
+        # restore it as a single backslash.
+        unescaped = (
+            payload.replace(rb"\\", b"\x00")
+            .replace(rb"\(", b"(")
+            .replace(rb"\)", b")")
+            .replace(b"\x00", b"\\")
+        )
+        try:
+            return unescaped.decode("utf-8")
+        except UnicodeDecodeError:
+            # latin-1 maps every byte, so this fallback cannot fail.
+            return unescaped.decode("latin-1")
+
+    try:
+        pdf_bytes = Path(file_path).read_bytes()
+    except OSError as exc:
+        logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
+        return ""
+
+    return "\n".join(
+        _decode(operator.group(1)) for operator in _TJ_PATTERN.finditer(pdf_bytes)
+    )
+
+
+async def fake_process_document(
+    self,
+    file_path: str,
+    filename: str | None = None,
+) -> dict[str, Any]:
+    """Stand-in for ``DoclingService.process_document`` in E2E runs.
+
+    Builds a small markdown document -- a ``# <name>`` heading followed by
+    the text pulled from the fixture PDF -- and wraps it in a result dict
+    with the keys ``content``, ``full_text``, ``service_used``, ``status``
+    and ``processing_notes``, so callers such as
+    ``app/etl_pipeline/parsers/docling.py`` can keep reading
+    ``result["content"]``.
+
+    Args:
+        file_path: Path of the PDF to read.
+        filename: Display name for the heading; defaults to the basename
+            of ``file_path``.
+    """
+    extracted = _extract_text_from_synthetic_pdf(file_path)
+    display_name = filename or Path(file_path).name
+
+    # When nothing was extracted, emit a placeholder body instead of
+    # raising: a failing canary assertion is a much clearer failure mode
+    # than a hard parser exception in the indexing pipeline.
+    content = (
+        f"# {display_name}\n\n{extracted}\n"
+        if extracted
+        else f"# {display_name}\n\n(empty docling fake — no text-show operators found)\n"
+    )
+
+    logger.info(
+        "[fake-docling] returning %d chars for %s",
+        len(content),
+        display_name,
+    )
+
+    payload: dict[str, Any] = {
+        "content": content,
+        "full_text": content,
+        "service_used": "docling-fake",
+        "status": "success",
+        "processing_notes": "e2e fake DoclingService — no real PDF parsing",
+    }
+    return payload
+
+
+def install(patches: list[Any]) -> None:
+    """Replace ``DoclingService.process_document`` with the fake, class-wide.
+
+    The patch is applied to the class attribute rather than to individual
+    call sites: consumers obtain instances via ``create_docling_service()``
+    and call the method on them, so normal method lookup finds the
+    replacement. The started patcher is appended to ``patches`` so the
+    caller keeps a handle on it.
+
+    Raises:
+        RuntimeError: When the target cannot be resolved. This is
+            deliberately loud -- a silent passthrough would run real
+            Docling under ``HF_HUB_OFFLINE=1`` and hang CI for minutes
+            per test.
+    """
+    from unittest.mock import patch as _patch
+
+    target = "app.services.docling_service.DoclingService.process_document"
+    try:
+        patcher = _patch(target, fake_process_document)
+        patcher.start()
+        patches.append(patcher)
+        logger.info("[fake-docling] patched %s", target)
+    except (ModuleNotFoundError, AttributeError) as exc:
+        hint = (
+            f"Could not patch Docling binding {target!r}: {exc!s}. "
+            f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
+            f"to point at the new binding site."
+        )
+        raise RuntimeError(hint) from exc
--- a/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
+++ b/surfsense_backend/tests/e2e/fixtures/global_llm_config.yaml
@ -0,0 +1,71 @@
+# Synthetic Global LLM configuration for E2E ONLY.
+#
+# Why this file exists:
+#   surfsense_backend/app/config/global_llm_config.yaml is gitignored
+#   (operators ship real API keys there). In CI that file does not exist,
+#   so app.config.load_global_llm_configs() returns [], every chat-stream
+#   test fails fast with "No usable global LLM configs are available for
+#   Auto mode" raised by auto_model_pin_service._global_candidates().
+#
+# What this file does:
+#   tests/e2e/run_backend.py and tests/e2e/run_celery.py copy this file
+#   to app/config/global_llm_config.yaml at startup, BEFORE app.config
+#   is imported. The copy lives only inside the E2E Docker container.
+#
+# Why a fake api_key is safe:
+#   tests.e2e.fakes.chat_llm patches
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config
+#     app.tasks.chat.stream_new_chat.create_chat_litellm_from_config
+#   so the resolved auto-pin id is never sent to a real LLM provider.
+#   The values below only need to pass
+#   auto_model_pin_service._is_usable_global_config()
+#   which requires id / model_name / provider / api_key all truthy.
+#
+# Why TWO entries (premium + free):
+#   auto_model_pin_service.resolve_or_get_pinned_llm_config_id() splits
+#   candidates by billing_tier based on _is_premium_eligible(user):
+#     premium_eligible == True  -> keeps only tier=="premium" configs
+#     premium_eligible == False -> keeps only tier!="premium" configs
+#   A single-tier fixture would fail one of the two branches with
+#   "Auto mode could not find an eligible LLM config for this user and
+#   quota state". Shipping one of each guarantees every quota state
+#   resolves to a viable pin in E2E.
+
+router_settings:
+  routing_strategy: "simple-shuffle"
+  num_retries: 0
+  allowed_fails: 1
+  cooldown_time: 1
+
+global_llm_configs:
+  - id: -9001
+    name: "E2E Fake Auto Model (premium)"
+    billing_tier: "premium"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-premium"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-premium"
+
+  - id: -9002
+    name: "E2E Fake Auto Model (free)"
+    billing_tier: "free"
+    anonymous_enabled: false
+    seo_enabled: false
+    quality_score: 1.0
+    provider: "OPENAI"
+    model_name: "fake-e2e-model-free"
+    api_key: "fake-e2e-api-key-not-for-production"
+    supports_image_input: false
+    quota_reserve_tokens: 1024
+    rpm: 1000
+    tpm: 100000
+    litellm_params:
+      model: "openai/fake-e2e-model-free"
--- a/surfsense_backend/tests/e2e/run_backend.py
+++ b/surfsense_backend/tests/e2e/run_backend.py
@ -23,15 +23,12 @@ Usage:

 from __future__ import annotations

+import asyncio
 import logging
 import os
 import sys

-# ---------------------------------------------------------------------------
-# 1) Hijack sys.modules BEFORE any production import.
-#    Production: composio_service.py:11 does `from composio import Composio`.
-#    With this hijack in place, that import resolves to our strict fake.
-# ---------------------------------------------------------------------------
+import uvicorn

 # Make the surfsense_backend root importable as a top-level package so
 # `import tests.e2e.fakes...` works regardless of how the entrypoint is
@ -42,97 +39,175 @@ _BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
 if _BACKEND_ROOT not in sys.path:
    sys.path.insert(0, _BACKEND_ROOT)

-import tests.e2e.fakes.composio_module as _fake_composio  # noqa: E402
-import tests.e2e.fakes.notion_module as _fake_notion  # noqa: E402

-sys.modules["composio"] = _fake_composio
-sys.modules["notion_client"] = _fake_notion
-sys.modules["notion_client.errors"] = _fake_notion.errors
-
-
-# ---------------------------------------------------------------------------
-# 2) Standard logging + dotenv so the rest of the app behaves like main.py.
-# ---------------------------------------------------------------------------
-
-from dotenv import load_dotenv  # noqa: E402
-
-load_dotenv()
-os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
-os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
-os.environ.setdefault(
-    "CONFLUENCE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/confluence/connector/callback",
-)
-os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
-os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
-os.environ.setdefault(
-    "NOTION_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/notion/connector/callback",
-)
-os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
-os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
-os.environ.setdefault(
-    "ONEDRIVE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
-)
-os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
-os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
-os.environ.setdefault(
-    "DROPBOX_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
-)
-os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
-os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
 logger = logging.getLogger("surfsense.e2e.backend")
-logger.warning(
-    "*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
-)
-
-
-# ---------------------------------------------------------------------------
-# 3) Now import the production app. Every module in app.* loads here,
-#    creating their bindings (some of which we will patch in step 4).
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# 4) Patch LLM + embedding bindings at every consumer site.
-#    Composio is already covered by the sys.modules hijack in step 1.
-# ---------------------------------------------------------------------------
-from unittest.mock import patch  # noqa: E402
-
-from app.app import app  # noqa: E402
-from tests.e2e.fakes import (  # noqa: E402
-    clickup_module as _fake_clickup_module,
-    confluence_indexer as _fake_confluence_indexer,
-    confluence_oauth as _fake_confluence_oauth,
-    dropbox_api as _fake_dropbox_api,
-    embeddings as _fake_embeddings,
-    jira_module as _fake_jira_module,
-    linear_module as _fake_linear_module,
-    mcp_oauth_runtime as _fake_mcp_oauth_runtime,
-    mcp_runtime as _fake_mcp_runtime,
-    native_google as _fake_native_google,
-    notion_module as _fake_notion_module,
-    onedrive_graph as _fake_onedrive_graph,
-    slack_module as _fake_slack_module,
-)
-from tests.e2e.fakes.chat_llm import (  # noqa: E402
-    fake_create_chat_litellm_from_agent_config,
-    fake_create_chat_litellm_from_config,
-)
-from tests.e2e.fakes.llm import fake_get_user_long_context_llm  # noqa: E402

+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
 _active_patches: list = []


+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_production_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_production_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google
+    # raises "Fake Google Flow requires redirect_uri." if these are empty,
+    # so connector/add routes return 500 in CI where no .env supplies them.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, ``auto_model_pin_service`` raises
+    ``"No usable global LLM configs are available for Auto mode"`` on
+    every chat-stream request.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE ``_import_production_app()`` so
+    ``app.config`` picks it up on import. Production code is untouched —
+    this is purely a test-time scaffold.
+
+    Only installs when the destination is missing. A developer running
+    the E2E entrypoint locally keeps their real ``global_llm_config.yaml``
+    intact (the patched ``create_chat_litellm_from_*`` factories make the
+    actual model values irrelevant either way).
+
+    The copy is staged under a per-process temporary name next to the
+    destination and then renamed into place, so another E2E process that
+    bootstraps at the same time never observes a partially written YAML
+    at the destination path.
+
+    MUST run before _import_production_app().
+
+    Raises:
+        RuntimeError: if the checked-in fixture file is missing.
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"This file is checked into tests/e2e/fixtures/ — if it has gone "
+            f"missing, restore it from VCS before running the E2E entrypoint."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+
+    # Stage the copy beside dst (same filesystem) and rename it into place.
+    # A direct copyfile(src, dst) creates dst empty and then fills it, so a
+    # second process passing the exists() check above in that window would
+    # go on to load a truncated config.
+    tmp = f"{dst}.{os.getpid()}.tmp"
+    try:
+        shutil.copyfile(src, tmp)
+        os.replace(tmp, dst)
+    finally:
+        # After a successful replace the temp name is gone; this only
+        # cleans up when the copy or the rename failed part-way.
+        if os.path.exists(tmp):
+            os.remove(tmp)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_production_app():
+    """Import and return the production FastAPI app.
+
+    Every module under ``app.*`` loads here, creating their bindings.
+    The LLM/embedding factories captured at this point will be replaced
+    by patches in _patch_llm_bindings() below.
+    """
+    from app.app import app as production_app
+
+    return production_app
+
+
 def _patch_llm_bindings() -> None:
    """Replace LLM factories at every known binding site."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
    targets = [
        "app.services.llm_service.get_user_long_context_llm",
        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -190,38 +265,90 @@ def _patch_llm_bindings() -> None:
            logger.warning("[fake-chat-llm] could not patch %s: %s.", target, exc)


-_patch_llm_bindings()
-_fake_embeddings.install(_active_patches)
-_fake_confluence_oauth.install(_active_patches)
-_fake_confluence_indexer.install(_active_patches)
-_fake_native_google.install(_active_patches)
-_fake_onedrive_graph.install(_active_patches)
-_fake_dropbox_api.install(_active_patches)
-_fake_notion_module.install(_active_patches)
-_fake_linear_module.install(_active_patches)
-_fake_jira_module.install(_active_patches)
-_fake_clickup_module.install(_active_patches)
-_fake_mcp_runtime.install(_active_patches)
-_fake_mcp_oauth_runtime.install(_active_patches)
-_fake_slack_module.install(_active_patches)
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)


-# ---------------------------------------------------------------------------
-# 5) Mount test-only middleware. Production never reaches this code.
-# ---------------------------------------------------------------------------
+def _install_test_only_app_extensions(app) -> None:
+    """Mount test-only middleware + the /__e2e__ token mint router.

-from tests.e2e.middleware.scenario import ScenarioMiddleware  # noqa: E402
+    POST /__e2e__/auth/token bypasses /auth/jwt/login's 5/min/IP rate
+    limit so Playwright workers can authenticate without thrashing the
+    production auth surface. See tests/e2e/auth_mint.py.
+    """
+    from tests.e2e.auth_mint import install as install_e2e_mint
+    from tests.e2e.middleware.scenario import ScenarioMiddleware

-app.add_middleware(ScenarioMiddleware)
+    app.add_middleware(ScenarioMiddleware)
+    install_e2e_mint(app)


-# ---------------------------------------------------------------------------
-# 6) Start uvicorn, mirroring main.py's behaviour.
-# ---------------------------------------------------------------------------
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production FastAPI app.

-import asyncio  # noqa: E402
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so Auto-mode
+         pin resolution finds at least one usable candidate.
+      5) Import production app (which transitively imports the now-faked
+         external SDKs and reads the env defaults + YAML).
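+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Install runtime fakes (embeddings, Docling, connectors, MCP,
+         Slack) via _install_runtime_fakes().
+      8) Mount test-only middleware + /__e2e__ routes onto the app.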
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()

-import uvicorn  # noqa: E402
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    production_app = _import_production_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    _install_test_only_app_extensions(production_app)
+    return production_app
+
+
+app = _bootstrap()


 def _main() -> None:
--- a/surfsense_backend/tests/e2e/run_celery.py
+++ b/surfsense_backend/tests/e2e/run_celery.py
@ -25,96 +25,166 @@ if _BACKEND_ROOT not in sys.path:
    sys.path.insert(0, _BACKEND_ROOT)


-# ---------------------------------------------------------------------------
-# 1) Hijack sys.modules BEFORE production celery imports anything.
-# ---------------------------------------------------------------------------
-
-import tests.e2e.fakes.composio_module as _fake_composio  # noqa: E402
-import tests.e2e.fakes.notion_module as _fake_notion  # noqa: E402
-
-sys.modules["composio"] = _fake_composio
-sys.modules["notion_client"] = _fake_notion
-sys.modules["notion_client.errors"] = _fake_notion.errors
-
-
-# ---------------------------------------------------------------------------
-# 2) Logging + dotenv.
-# ---------------------------------------------------------------------------
-
-from dotenv import load_dotenv  # noqa: E402
-
-load_dotenv()
-os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
-os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
-os.environ.setdefault(
-    "CONFLUENCE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/confluence/connector/callback",
-)
-os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
-os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
-os.environ.setdefault(
-    "NOTION_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/notion/connector/callback",
-)
-os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
-os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
-os.environ.setdefault(
-    "ONEDRIVE_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
-)
-os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
-os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
-os.environ.setdefault(
-    "DROPBOX_REDIRECT_URI",
-    "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
-)
-os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
-os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
 logger = logging.getLogger("surfsense.e2e.celery")
-logger.warning("*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***")
-
-
-# ---------------------------------------------------------------------------
-# 3) Import the production celery_app. All task modules load here.
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# 4) Patch LLM + embedding bindings inside the worker process.
-# ---------------------------------------------------------------------------
-from unittest.mock import patch  # noqa: E402
-
-from app.celery_app import celery_app  # noqa: E402
-from tests.e2e.fakes import (  # noqa: E402
-    clickup_module as _fake_clickup_module,
-    confluence_indexer as _fake_confluence_indexer,
-    confluence_oauth as _fake_confluence_oauth,
-    dropbox_api as _fake_dropbox_api,
-    embeddings as _fake_embeddings,
-    jira_module as _fake_jira_module,
-    linear_module as _fake_linear_module,
-    mcp_oauth_runtime as _fake_mcp_oauth_runtime,
-    mcp_runtime as _fake_mcp_runtime,
-    native_google as _fake_native_google,
-    notion_module as _fake_notion_module,
-    onedrive_graph as _fake_onedrive_graph,
-    slack_module as _fake_slack_module,
-)
-from tests.e2e.fakes.chat_llm import (  # noqa: E402
-    fake_create_chat_litellm_from_agent_config,
-    fake_create_chat_litellm_from_config,
-)
-from tests.e2e.fakes.llm import fake_get_user_long_context_llm  # noqa: E402

+# Patches started during bootstrap are kept alive for the lifetime of the
+# process. We never call .stop() on them.
 _active_patches: list = []


+def _hijack_external_sdks() -> None:
+    """Replace composio + notion_client in sys.modules.
+
+    Production does ``from composio import Composio`` and
+    ``import notion_client`` at import time. With this hijack in place,
+    those imports resolve to our strict fakes.
+
+    MUST run before _import_celery_app().
+    """
+    import tests.e2e.fakes.composio_module as _fake_composio
+    import tests.e2e.fakes.notion_module as _fake_notion
+
+    sys.modules["composio"] = _fake_composio
+    sys.modules["notion_client"] = _fake_notion
+    sys.modules["notion_client.errors"] = _fake_notion.errors
+
+
+def _load_dotenv_and_set_env_defaults() -> None:
+    """Load .env and set every env var the production config reads on import.
+
+    MUST run before _import_celery_app(), since app.config consumes
+    these values at import time.
+    """
+    from dotenv import load_dotenv
+
+    load_dotenv()
+
+    os.environ.setdefault(
+        "DATABASE_URL",
+        "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
+    )
+    os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
+    os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
+    os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
+    os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
+    os.environ.setdefault("AUTH_TYPE", "LOCAL")
+    os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
+    os.environ.setdefault("ETL_SERVICE", "DOCLING")
+    os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+    os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
+
+    # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
+    os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
+    os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
+    os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
+
+    os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
+    os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
+    os.environ.setdefault(
+        "CONFLUENCE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
+    )
+    os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
+    os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
+    os.environ.setdefault(
+        "NOTION_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/notion/connector/callback",
+    )
+    os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
+    os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
+    os.environ.setdefault(
+        "ONEDRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
+    )
+    os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
+    os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
+    os.environ.setdefault(
+        "DROPBOX_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
+    )
+    # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google raises
+    # "Fake Google Flow requires redirect_uri." when these are empty.
+    os.environ.setdefault(
+        "GOOGLE_DRIVE_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_GMAIL_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
+    )
+    os.environ.setdefault(
+        "GOOGLE_CALENDAR_REDIRECT_URI",
+        "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
+    )
+    os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
+    os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
+
+
+def _install_synthetic_global_llm_config() -> None:
+    """Materialise a fake ``app/config/global_llm_config.yaml`` for E2E.
+
+    The real file is gitignored (production operators ship their own with
+    real API keys), so a fresh CI checkout has no YAML at the path
+    ``app.config.load_global_llm_configs()`` reads. With an empty
+    ``GLOBAL_LLM_CONFIGS`` list, the worker's view of the config diverges
+    from the API container.
+
+    We copy the synthetic fixture from ``tests/e2e/fixtures/`` into the
+    production-expected location BEFORE _import_celery_app() so
+    ``app.config`` picks it up on import. Install-only-if-missing so a
+    developer's local config (with real API keys) is preserved.
+
+    The copy is staged under a per-process temporary name next to the
+    destination and then renamed into place, so another E2E process that
+    bootstraps at the same time never observes a partially written YAML
+    at the destination path.
+
+    MUST run before _import_celery_app().
+
+    Raises:
+        RuntimeError: if the checked-in fixture file is missing.
+    """
+    import shutil
+
+    src = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
+    dst = os.path.join(_BACKEND_ROOT, "app", "config", "global_llm_config.yaml")
+
+    if not os.path.exists(src):
+        raise RuntimeError(
+            f"E2E synthetic global LLM config fixture missing at {src!r}. "
+            f"Restore tests/e2e/fixtures/global_llm_config.yaml from VCS."
+        )
+
+    if os.path.exists(dst):
+        logger.info(
+            "[e2e-global-llm-config] %s already exists; leaving it alone "
+            "(local dev config preserved)",
+            dst,
+        )
+        return
+
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+
+    # Stage the copy beside dst (same filesystem) and rename it into place.
+    # A direct copyfile(src, dst) creates dst empty and then fills it, so a
+    # second process passing the exists() check above in that window would
+    # go on to load a truncated config.
+    tmp = f"{dst}.{os.getpid()}.tmp"
+    try:
+        shutil.copyfile(src, tmp)
+        os.replace(tmp, dst)
+    finally:
+        # After a successful replace the temp name is gone; this only
+        # cleans up when the copy or the rename failed part-way.
+        if os.path.exists(tmp):
+            os.remove(tmp)
+    logger.info("[e2e-global-llm-config] installed %s -> %s", src, dst)
+
+
+def _import_celery_app():
+    """Import and return the production Celery app.
+
+    Every module under ``app.*`` (including all task modules) loads here,
+    creating their bindings. The LLM/embedding factories captured at this
+    point will be replaced by patches in _patch_llm_bindings() below.
+    """
+    from app.celery_app import celery_app
+
+    return celery_app
+
+
 def _patch_llm_bindings() -> None:
+    """Replace LLM factories at every known binding site in worker tasks."""
+    from unittest.mock import patch
+
+    from tests.e2e.fakes.chat_llm import (
+        fake_create_chat_litellm_from_agent_config,
+        fake_create_chat_litellm_from_config,
+    )
+    from tests.e2e.fakes.llm import fake_get_user_long_context_llm
+
    targets = [
        "app.services.llm_service.get_user_long_context_llm",
        "app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -172,38 +242,93 @@ def _patch_llm_bindings() -> None:
            )


-_patch_llm_bindings()
-_fake_embeddings.install(_active_patches)
-_fake_confluence_oauth.install(_active_patches)
-_fake_confluence_indexer.install(_active_patches)
-_fake_native_google.install(_active_patches)
-_fake_onedrive_graph.install(_active_patches)
-_fake_dropbox_api.install(_active_patches)
-_fake_notion_module.install(_active_patches)
-_fake_linear_module.install(_active_patches)
-_fake_jira_module.install(_active_patches)
-_fake_clickup_module.install(_active_patches)
-_fake_mcp_runtime.install(_active_patches)
-_fake_mcp_oauth_runtime.install(_active_patches)
-_fake_slack_module.install(_active_patches)
+def _install_runtime_fakes() -> None:
+    """Run each fake's install() against the active patch stack."""
+    from tests.e2e.fakes import (
+        clickup_module as _fake_clickup_module,
+        confluence_indexer as _fake_confluence_indexer,
+        confluence_oauth as _fake_confluence_oauth,
+        docling_service as _fake_docling_service,
+        dropbox_api as _fake_dropbox_api,
+        embeddings as _fake_embeddings,
+        jira_module as _fake_jira_module,
+        linear_module as _fake_linear_module,
+        mcp_oauth_runtime as _fake_mcp_oauth_runtime,
+        mcp_runtime as _fake_mcp_runtime,
+        native_google as _fake_native_google,
+        notion_module as _fake_notion_module,
+        onedrive_graph as _fake_onedrive_graph,
+        slack_module as _fake_slack_module,
+    )
+
+    _fake_embeddings.install(_active_patches)
+    _fake_docling_service.install(_active_patches)
+    _fake_confluence_oauth.install(_active_patches)
+    _fake_confluence_indexer.install(_active_patches)
+    _fake_native_google.install(_active_patches)
+    _fake_onedrive_graph.install(_active_patches)
+    _fake_dropbox_api.install(_active_patches)
+    _fake_notion_module.install(_active_patches)
+    _fake_linear_module.install(_active_patches)
+    _fake_jira_module.install(_active_patches)
+    _fake_clickup_module.install(_active_patches)
+    _fake_mcp_runtime.install(_active_patches)
+    _fake_mcp_oauth_runtime.install(_active_patches)
+    _fake_slack_module.install(_active_patches)


-# ---------------------------------------------------------------------------
-# 5) Start the worker.
-# ---------------------------------------------------------------------------
+def _bootstrap():
+    """Run the full E2E bootstrap and return the production Celery app.
+
+    Ordering is load-bearing:
+      1) Hijack composio + notion_client in sys.modules.
+      2) Load .env + set env defaults (app.config reads env on import).
+      3) Configure logging.
+      4) Materialise the synthetic global_llm_config.yaml so the worker's
+         view of GLOBAL_LLM_CONFIGS matches the API container.
+      5) Import production celery_app (which transitively imports the
+         now-faked external SDKs and reads the env defaults + YAML).
+      6) Patch LLM / embedding bindings at every consumer site.
+      7) Install runtime fakes for connectors and chat backends.
+    """
+    _hijack_external_sdks()
+    _load_dotenv_and_set_env_defaults()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.warning(
+        "*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***"
+    )
+
+    _install_synthetic_global_llm_config()
+    celery_app = _import_celery_app()
+    _patch_llm_bindings()
+    _install_runtime_fakes()
+    return celery_app
+
+
+celery_app = _bootstrap()


 def _main() -> None:
-    # Default queues mirror production (default queue + connectors queue
-    # so Drive indexing tasks are picked up).
    queue_name = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
    queues = f"{queue_name},{queue_name}.connectors"
+
+    # macOS forks-after-MPS-init crash prefork workers; threads avoid it.
+    default_pool = "threads" if sys.platform == "darwin" else "prefork"
+    pool = os.getenv("CELERY_POOL", default_pool)
+    concurrency = os.getenv("CELERY_CONCURRENCY", "2")
+
    celery_app.worker_main(
        argv=[
            "worker",
            "--loglevel=info",
            f"--queues={queues}",
-            "--concurrency=2",
+            f"--pool={pool}",
+            f"--concurrency={concurrency}",
            "--without-gossip",
            "--without-mingle",
        ]
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
    assert result.content_type == "document"


+# ---------------------------------------------------------------------------
+# Document path with vision LLM: per-image descriptions are inlined or appended
+# ---------------------------------------------------------------------------
+
+
+def _fake_extraction_result(*descriptions):
+    """Build a ``PictureExtractionResult`` from plain dict specs.
+
+    Each positional argument is a dict with required keys ``page``,
+    ``name`` and ``desc``; ``ordinal`` defaults to 0 and ``sha`` to
+    ``"deadbeef"``. Calling with no arguments yields a result with an
+    empty ``descriptions`` list.
+    """
+    from app.etl_pipeline.picture_describer import (
+        PictureDescription,
+        PictureExtractionResult,
+    )
+
+    built = []
+    for spec in descriptions:
+        built.append(
+            PictureDescription(
+                page_number=spec["page"],
+                ordinal_in_page=spec.get("ordinal", 0),
+                name=spec["name"],
+                sha256=spec.get("sha", "deadbeef"),
+                description=spec["desc"],
+            )
+        )
+    return PictureExtractionResult(descriptions=built)
+
+
+async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
+    """An ``<!-- image -->`` placeholder plus caption is replaced in place.
+
+    The description block must land where the placeholder was (no
+    orphaned ``## Image Content`` section), keeping the image content
+    next to the surrounding case text. Per the original author this is
+    the headline scenario for the medxpertqa benchmark, where retrieval
+    should pull the question, image, and answer options together.
+    """
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    parser_output = {
+        "content": (
+            "# MedXpertQA-MM MM-130\n\n"
+            "## Clinical case\n\nA 44-year-old man...\n\n"
+            "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+            "## Answer choices\n\nA) ...\n"
+        )
+    }
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = parser_output
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    described = _fake_extraction_result(
+        {
+            "page": 1,
+            "name": "Im0",
+            "desc": "Axial CT showing a large cystic mass.",
+        }
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=described),
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    request = EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    result = await service.extract(request)
+    md = result.markdown_content
+
+    # Must be present: the inline block (captioned filename + visual
+    # description) and the untouched surrounding case text / options.
+    for expected in (
+        "**Embedded image:** `MM-130-a.jpeg`",
+        "**Visual description:**",
+        "Axial CT showing a large cystic mass.",
+        "A 44-year-old man...",
+        "## Answer choices",
+        "A) ...",
+    ):
+        assert expected in md
+
+    # Must be absent, in order: the consumed placeholder and caption; an
+    # OCR section (the fake result carries no ocr_text); raw tags or
+    # blockquote wrapping; and the appended-fallback heading.
+    for unexpected in (
+        "<!-- image -->",
+        "Image: MM-130-a.jpeg",
+        "**OCR text:**",
+        "<image",
+        "> **Embedded image:**",
+        "## Image Content",
+    ):
+        assert unexpected not in md
+
+
+async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
+    """Without image markers in the parser markdown, descriptions are appended.
+
+    Fallback for parsers that drop image placeholders entirely: the
+    image content still reaches the markdown, under a clearly labelled
+    section instead of inline.
+    """
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {
+        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    described = _fake_extraction_result(
+        {"page": 1, "name": "Im0", "desc": "An image description."}
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=described),
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    result = await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    for expected in (
+        "# Parsed PDF text",
+        "## Image Content (vision-LLM extracted)",
+        "**Embedded image:** `Im0`",
+        "An image description.",
+    ):
+        assert expected in md
+
+
+async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
+    tmp_path, mocker
+):
+    """With no vision LLM configured, the parser markdown comes back untouched."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    # Keep a handle on the replacement so we can prove it was never used.
+    describe_mock = mocker.AsyncMock()
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=describe_mock,
+    )
+
+    service = EtlPipelineService()
+    result = await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    assert md == "# Parsed PDF text"
+    assert "<image" not in md
+    describe_mock.assert_not_called()
+
+
+async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
+    tmp_path, mocker
+):
+    """An exception raised by describe_pictures must not fail the extraction.
+
+    The parser markdown is returned as-is and the ETL service is still
+    reported as DOCLING.
+    """
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    exploding_describe = mocker.AsyncMock(side_effect=RuntimeError("pypdf exploded"))
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=exploding_describe,
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    result = await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert result.etl_service == "DOCLING"
+
+
+async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
+    tmp_path, mocker
+):
+    """An empty extraction result leaves the parser markdown unchanged."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Just text, no images"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    nothing_found = _fake_extraction_result()
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=nothing_found),
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    result = await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    assert md == "# Just text, no images"
+    assert "<image" not in md
+
+
+# ---------------------------------------------------------------------------
+# Per-image OCR runner: wiring + behaviour
+#
+# When extracting a PDF with a vision LLM, the ETL service must ALSO
+# pass an ``ocr_runner`` to picture_describer. The runner is a closure
+# that re-feeds each extracted image through a vision-LLM-less
+# EtlPipelineService -- i.e. the same OCR engine that handles
+# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
+# at each embedded image, with the text attached to the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
+    tmp_path, mocker
+):
+    """describe_pictures is awaited once with a callable ``ocr_runner`` kwarg."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    # Keep a handle on the replacement so its await arguments can be inspected.
+    describe_mock = mocker.AsyncMock(return_value=_fake_extraction_result())
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=describe_mock,
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    describe_mock.assert_awaited_once()
+    _positional, passed_kwargs = describe_mock.await_args
+    assert "ocr_runner" in passed_kwargs
+    assert callable(passed_kwargs["ocr_runner"])
+
+
+async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
+    tmp_path, mocker
+):
+    """The captured OCR runner feeds an image back through the document parser.
+
+    The test intercepts the ``ocr_runner`` handed to describe_pictures,
+    awaits it with an image path, and checks that it returns the stubbed
+    Docling content and that Docling was awaited a second time. Per the
+    original author, this shows the closure goes through a
+    vision-LLM-less sub-pipeline rather than recursing into the vision
+    LLM.
+    """
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+    embedded_image = tmp_path / "Im0.png"
+    embedded_image.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {
+        "content": "Slice 24 / 60   L   R"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    seen_runners: list = []
+
+    async def record_runner(*args, **kwargs):
+        # Stand-in for describe_pictures: remember the runner, describe nothing.
+        seen_runners.append(kwargs["ocr_runner"])
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=record_runner,
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    ocr_text = await seen_runners[-1](str(embedded_image), "Im0.png")
+
+    assert ocr_text == "Slice 24 / 60   L   R"
+    # Two Docling awaits in total: one for the PDF itself, one for the
+    # image re-fed through the runner.
+    assert docling_stub.process_document.await_count == 2
+
+
+async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
+    tmp_path, mocker
+):
+    """For an unsupported image format the runner returns "" instead of raising.
+
+    Scenario from the original author: a PDF embeds a JPEG2000 or
+    CCITT-TIFF image that Docling can't load. One such image must not
+    spoil the whole PDF extraction, so the runner is expected to swallow
+    the EtlUnsupportedFileError and return an empty string -- the image
+    keeps its description but gets no OCR text.
+    """
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake content")
+    # JPEG2000 file, unlikely to be a supported extension.
+    odd_image = tmp_path / "Im0.jp2"
+    odd_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)
+
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    seen_runners: list = []
+
+    async def record_runner(*args, **kwargs):
+        # Stand-in for describe_pictures: remember the runner, describe nothing.
+        seen_runners.append(kwargs["ocr_runner"])
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=record_runner,
+    )
+
+    service = EtlPipelineService(vision_llm=mocker.MagicMock())
+    await service.extract(
+        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
+    )
+
+    ocr_text = await seen_runners[-1](str(odd_image), "Im0.jp2")
+
+    assert ocr_text == ""
+
+
 # ---------------------------------------------------------------------------
 # Processing Mode enum tests
 # ---------------------------------------------------------------------------
--- a/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
@ -0,0 +1,967 @@
+"""Unit tests for the picture_describer module.
+
+Covers:
+
+- :func:`describe_pictures` -- the PDF image walker + per-image vision
+  LLM description call, plus optional per-image OCR through a
+  caller-supplied ``ocr_runner`` (recorded as ``ocr_text``);
+- :func:`inject_descriptions_inline` -- in-place replacement of image
+  placeholders / captions in the parser markdown;
+- :func:`merge_descriptions_into_markdown` -- the top-level helper
+  that inlines what it can and appends what it can't;
+- :func:`render_appended_section` -- the appended-fallback renderer.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.etl_pipeline.picture_describer import (
+    PictureDescription,
+    PictureExtractionResult,
+    describe_pictures,
+    inject_descriptions_inline,
+    merge_descriptions_into_markdown,
+    render_appended_section,
+)
+
+# Apply the ``unit`` marker to every test in this module.
+pytestmark = pytest.mark.unit
+
+
+def _make_image_obj(name: str, data: bytes):
+    """Return a mock exposing the ``name`` / ``data`` attributes of a pypdf image.
+
+    ``name`` cannot be passed to the ``MagicMock`` constructor (there it
+    names the mock itself), so both attributes are set via
+    ``configure_mock``.
+    """
+    image_stub = MagicMock()
+    image_stub.configure_mock(name=name, data=data)
+    return image_stub
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: short-circuits
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
+    """A non-PDF input yields an empty result and never touches the LLM."""
+    word_doc = tmp_path / "report.docx"
+    word_doc.write_bytes(b"PK fake docx")
+    vision_llm = AsyncMock()
+
+    outcome = await describe_pictures(str(word_doc), "report.docx", vision_llm)
+
+    vision_llm.ainvoke.assert_not_called()
+    assert outcome.descriptions == []
+    assert outcome.skipped_too_large == 0
+
+
+async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
+    """Passing ``None`` as the vision LLM yields no descriptions, even for a PDF."""
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", None)
+
+    assert outcome.descriptions == []
+
+
+async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
+    """A readable PDF whose pages carry no images produces an empty result."""
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    # Two pages, neither with any image.
+    reader_stub = MagicMock(pages=[MagicMock(images=[]) for _ in range(2)])
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    vision_llm = AsyncMock()
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", vision_llm)
+
+    vision_llm.ainvoke.assert_not_called()
+    assert outcome.descriptions == []
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: happy paths
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
+    """Every eligible image gets exactly one description-only vision call.
+
+    Two pages with one image each: both images must be described, each
+    description must be attributed to the right image, and each image
+    must report the page it came from.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    page1 = MagicMock(images=[img_a])
+    page2 = MagicMock(images=[img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Description A", "Description B"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 2
+    by_name = {d.name: d.description for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "Description A", "Im1.png": "Description B"}
+    # Pin the exact page per image; a bare membership check would pass
+    # even if both images reported the same page or the pages were swapped.
+    # NOTE(review): assumes page numbers are 1-based in page order -- confirm.
+    page_by_name = {d.name: d.page_number for d in result.descriptions}
+    assert page_by_name == {"Im0.jpeg": 1, "Im1.png": 2}
+    assert parse_mock.await_count == 2
+
+
+async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
+    """Identical image bytes on three pages are described only once.
+
+    The two repeats are counted in ``skipped_duplicate``.
+    """
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    logo_bytes = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
+    # Three pages, each with its own image object carrying the same bytes.
+    reader_stub = MagicMock()
+    reader_stub.pages = [
+        MagicMock(images=[_make_image_obj("logo.png", logo_bytes)])
+        for _ in range(3)
+    ]
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    vision_call = AsyncMock(return_value="Logo desc")
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=vision_call,
+    )
+
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())
+
+    assert vision_call.await_count == 1
+    assert len(outcome.descriptions) == 1
+    assert outcome.skipped_duplicate == 2
+
+
+async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
+    """Tiny images are skipped and counted in ``skipped_too_small``.
+
+    Per the original note the cut-off is sub-1KB (tracking pixels, dots,
+    etc.); here an 8-byte image is skipped while a ~3KB one is described.
+    """
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    speck = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
+    scan = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)
+    reader_stub = MagicMock()
+    reader_stub.pages = [MagicMock(images=[speck, scan])]
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    vision_call = AsyncMock(return_value="CT scan")
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=vision_call,
+    )
+
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())
+
+    assert vision_call.await_count == 1
+    assert outcome.skipped_too_small == 1
+    assert len(outcome.descriptions) == 1
+    assert outcome.descriptions[0].name == "ct.jpeg"
+
+
+async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
+    """Oversized images are skipped and counted in ``skipped_too_large``.
+
+    A 6 MiB image is skipped while a ~2KB image on the same page is
+    still described.
+    """
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    oversized = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
+    regular = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    reader_stub = MagicMock()
+    reader_stub.pages = [MagicMock(images=[oversized, regular])]
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    vision_call = AsyncMock(return_value="OK image")
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=vision_call,
+    )
+
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())
+
+    assert vision_call.await_count == 1
+    assert outcome.skipped_too_large == 1
+    assert len(outcome.descriptions) == 1
+    assert outcome.descriptions[0].name == "ok.jpeg"
+
+
+async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
+    """One failing vision call is counted in ``failed``; the other image survives."""
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    first = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
+    second = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)
+    reader_stub = MagicMock()
+    reader_stub.pages = [MagicMock(images=[first, second])]
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    # First vision call raises, second succeeds.
+    vision_call = AsyncMock(side_effect=[RuntimeError("vision blew up"), "Success"])
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=vision_call,
+    )
+
+    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())
+
+    assert outcome.failed == 1
+    assert len(outcome.descriptions) == 1
+    assert outcome.descriptions[0].description == "Success"
+
+
+async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
+    """If ``pypdf.PdfReader`` raises on open, the result has no descriptions."""
+    broken_pdf = tmp_path / "broken.pdf"
+    broken_pdf.write_bytes(b"not a pdf")
+
+    open_error = ValueError("EOF marker not found")
+    mocker.patch("pypdf.PdfReader", side_effect=open_error)
+
+    outcome = await describe_pictures(str(broken_pdf), "broken.pdf", MagicMock())
+
+    assert outcome.descriptions == []
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: replacement patterns
+# ---------------------------------------------------------------------------
+
+
+def _desc(name="Im0", description="A CT scan."):
+    """Return a page-1, ordinal-0 ``PictureDescription`` with sha256 ``"aa"``."""
+    fields = {
+        "page_number": 1,
+        "ordinal_in_page": 0,
+        "name": name,
+        "sha256": "aa",
+        "description": description,
+    }
+    return PictureDescription(**fields)
+
+
+def test_inject_no_op_when_no_descriptions():
+    """An empty extraction result leaves the markdown untouched, count 0."""
+    original = "# Title\n\nbody text\n"
+
+    rewritten, inlined = inject_descriptions_inline(
+        original, PictureExtractionResult()
+    )
+
+    assert inlined == 0
+    assert rewritten == original
+
+
+def test_inject_replaces_placeholder_with_caption():
+    """`<!-- image -->` followed by `Image: <name>` collapses into one block.
+
+    Per the original author this is the most common medxpertqa case: the
+    renderer puts a caption line right below the embedded JPEG, and
+    Docling preserves both.
+    """
+    source = (
+        "# Case\n\n"
+        "Clinical text...\n\n"
+        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+        "Answer choices: A) ...\n"
+    )
+    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, inlined = inject_descriptions_inline(source, extraction)
+
+    assert inlined == 1
+
+    # Present: the "Embedded image:" anchor with the captioned filename,
+    # the named "Visual description:" section with its text, the
+    # horizontal rules that delimit the block, and the surrounding
+    # context.
+    for expected in (
+        "**Embedded image:** `MM-130-a.jpeg`",
+        "**Visual description:**",
+        "A CT scan.",
+        "\n---\n",
+        "Clinical text...",
+        "Answer choices: A) ...",
+    ):
+        assert expected in out
+
+    # Absent: the consumed placeholder and caption; an OCR section (this
+    # fixture has no ocr_text); raw HTML tags; and blockquote wrapping.
+    # The original author's note: nested blocks (lists, code, tables)
+    # inside a blockquote are silently dropped by Streamdown / remark,
+    # hence no `>` prefix.
+    for unexpected in (
+        "<!-- image -->",
+        "Image: MM-130-a.jpeg",
+        "**OCR text:**",
+        "<image",
+        "</image>",
+        "> **Embedded image:**",
+    ):
+        assert unexpected not in out
+
+
+def test_inject_uses_pypdf_name_when_no_caption():
+    """A lone `<!-- image -->` is labelled with the description's own name."""
+    source = "# Case\n\n<!-- image -->\n\nMore text\n"
+    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, inlined = inject_descriptions_inline(source, extraction)
+
+    assert inlined == 1
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_inject_replaces_bare_caption():
+    """An `Image: <name>` line without a placeholder is replaced as well."""
+    source = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
+    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, inlined = inject_descriptions_inline(source, extraction)
+
+    assert inlined == 1
+    assert "Image: scan.jpeg" not in out
+    assert "**Embedded image:** `scan.jpeg`" in out
+
+
+def test_inject_handles_multiple_images_in_order():
+    """Two placeholders + two descriptions: each consumed in document order.
+
+    Both the captions and the description texts must keep their relative
+    order, so a swapped pairing (second description under the first
+    marker) is caught.
+    """
+    markdown = (
+        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
+        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1,
+                ordinal_in_page=0,
+                name="Im0",
+                sha256="aa",
+                description="Desc A",
+            ),
+            PictureDescription(
+                page_number=2,
+                ordinal_in_page=0,
+                name="Im1",
+                sha256="bb",
+                description="Desc B",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    assert "**Embedded image:** `a.jpeg`" in out
+    assert "**Embedded image:** `b.jpeg`" in out
+    assert out.index("a.jpeg") < out.index("b.jpeg")
+    assert "Desc A" in out and "Desc B" in out
+    # Caption order alone would not notice swapped descriptions.
+    assert out.index("Desc A") < out.index("Desc B")
+
+
+def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
+    """Three descriptions, one marker -> only one inlined, two leftover.
+
+    The returned count is the number of descriptions inlined (1); the
+    two leftover descriptions must not appear in the output at all.
+    """
+    markdown = "Just one <!-- image --> here.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+            _desc(name="Im2", description="Third"),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+    assert "First" in out
+    # Neither leftover may leak inline -- check both, by name and by text.
+    for leftover_name, leftover_text in (("Im1", "Second"), ("Im2", "Third")):
+        assert f"**Embedded image:** `{leftover_name}`" not in out
+        assert leftover_text not in out
+
+
+def test_inject_returns_zero_when_no_markers_present():
+    """With no image markers the markdown is returned unchanged, count 0."""
+    original = "# Title\n\nJust text. No images mentioned at all.\n"
+    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    rewritten, inlined = inject_descriptions_inline(original, extraction)
+
+    assert rewritten == original
+    assert inlined == 0
+
+
+# ---------------------------------------------------------------------------
+# render_appended_section
+# ---------------------------------------------------------------------------
+
+
+def test_render_appended_empty_when_nothing_passed():
+    """An empty description list renders as the empty string."""
+    rendered = render_appended_section([])
+    assert rendered == ""
+
+
+def test_render_appended_renders_each_image_as_block():
+    """Each description becomes its own rule-delimited block under the heading."""
+    rendered = render_appended_section(
+        [
+            _desc(name="MM-130-a.jpeg", description="CT scan"),
+            _desc(name="MM-130-b.jpeg", description="Bar chart"),
+        ]
+    )
+
+    for expected in (
+        "## Image Content (vision-LLM extracted)",
+        "**Embedded image:** `MM-130-a.jpeg`",
+        "CT scan",
+        "**Embedded image:** `MM-130-b.jpeg`",
+        "Bar chart",
+    ):
+        assert expected in rendered
+
+    # Horizontal rules delimit the image blocks.
+    assert rendered.count("\n---\n") >= 2
+
+    # No raw HTML / XML tags, no blockquote prefix, no OCR section.
+    for unexpected in ("<image", "> **Embedded image:**", "**OCR text:**"):
+        assert unexpected not in rendered
+
+
+def test_render_appended_includes_skip_notes():
+    """Skip counters passed via ``skip_notes`` show up in a ``_Note:`` line."""
+    shown = [_desc()]
+    counters = PictureExtractionResult(
+        descriptions=shown,
+        skipped_too_small=2,
+        skipped_too_large=1,
+        skipped_duplicate=3,
+        failed=1,
+    )
+
+    rendered = render_appended_section(shown, skip_notes=counters)
+
+    for fragment in (
+        "_Note:",
+        "2 too small",
+        "1 too large",
+        "3 duplicate",
+        "1 failed",
+    ):
+        assert fragment in rendered
+
+
+# ---------------------------------------------------------------------------
+# merge_descriptions_into_markdown: top-level
+# ---------------------------------------------------------------------------
+
+
+def test_merge_inlines_when_marker_present():
+    """One marker, one description: inlined, with no appended section."""
+    source = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
+    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    merged = merge_descriptions_into_markdown(source, extraction)
+
+    # Everything went inline, so the appended-section heading must not
+    # appear.
+    assert "## Image Content" not in merged
+    assert "**Embedded image:** `scan.jpeg`" in merged
+
+
+def test_merge_appends_when_no_marker_present():
+    """With no markers, the description lands in the appended section."""
+    source = "Pure text doc, no image markers.\n"
+    only_description = _desc(name="Im0", description="An image desc.")
+    extraction = PictureExtractionResult(descriptions=[only_description])
+
+    merged = merge_descriptions_into_markdown(source, extraction)
+
+    for expected in (
+        "Pure text doc",
+        "## Image Content (vision-LLM extracted)",
+        "**Embedded image:** `Im0`",
+    ):
+        assert expected in merged
+
+
+def test_merge_appends_leftovers_with_distinct_heading():
+    """One marker, two descriptions: first inlined, second appended.
+
+    The appended leftover sits under a heading that says no inline
+    marker was found for it.
+    """
+    source = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
+    inlined_desc = _desc(name="Im0", description="First")
+    leftover_desc = _desc(name="Im1", description="Second")
+    extraction = PictureExtractionResult(descriptions=[inlined_desc, leftover_desc])
+
+    merged = merge_descriptions_into_markdown(source, extraction)
+
+    # Inlined under the captioned filename.
+    assert "**Embedded image:** `a.jpeg`" in merged
+    # Leftover appended under the distinct heading, by its own name.
+    assert "## Image Content (additional, no inline marker found)" in merged
+    assert "**Embedded image:** `Im1`" in merged
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: ocr_runner integration
+#
+# These tests cover the per-image OCR side-channel: when the caller
+# supplies an ``ocr_runner`` callable, each extracted image is sent
+# both to the vision LLM (visual description) and to the OCR runner
+# (text-in-image), in parallel. The OCR text -- if any -- is recorded
+# on the PictureDescription and rendered in the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
+    """A supplied ``ocr_runner`` is awaited once per image and fills ``ocr_text``."""
+    source_pdf = tmp_path / "report.pdf"
+    source_pdf.write_bytes(b"%PDF-1.4 fake")
+
+    jpeg_image = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    png_image = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    reader_stub = MagicMock()
+    reader_stub.pages = [MagicMock(images=[jpeg_image, png_image])]
+    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Visual A", "Visual B"]),
+    )
+    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])
+
+    outcome = await describe_pictures(
+        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=ocr_runner
+    )
+
+    assert ocr_runner.await_count == 2
+    ocr_by_name = {entry.name: entry.ocr_text for entry in outcome.descriptions}
+    assert ocr_by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
+
+
+async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
+    tmp_path, mocker
+):
+    """Vision LLM and OCR run concurrently per image, not sequentially.
+
+    We verify this by recording call timestamps: if both finish within
+    a small window relative to the per-call sleep, they ran in parallel.
+    """
+    import asyncio
+    import time
+
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    sleep_each = 0.05  # 50ms per call
+
+    async def slow_vision(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "Visual"
+
+    async def slow_ocr(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "OCR"
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=slow_vision,
+    )
+
+    fake_llm = MagicMock()
+    started = time.perf_counter()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
+    )
+    elapsed = time.perf_counter() - started
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text == "OCR"
+    # Sequential would be ~2*sleep_each. Parallel is ~1*sleep_each + overhead.
+    # Be generous with the bound so we're not flaky on slow CI.
+    assert elapsed < 1.5 * sleep_each, (
+        f"vision+OCR appear to be sequential (took {elapsed:.3f}s)"
+    )
+
+
+async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
+    """Empty / whitespace-only OCR result is normalised to None.
+
+    This means the rendered image block won't carry an empty
+    "OCR text" section for images that contain no text at all
+    (e.g. a clean radiograph).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(return_value="   \n  \n")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
+    """An OCR runner exception must not kill the description for that image.
+
+    OCR is supplementary; the vision LLM's description is the primary
+    payload. If OCR blows up we drop the OCR field for that image and
+    keep the description.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(side_effect=RuntimeError("OCR backend down"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "A radiograph."
+    assert result.descriptions[0].ocr_text is None
+    assert result.failed == 0  # the IMAGE didn't fail; only its OCR did
+
+
+async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
+    tmp_path, mocker
+):
+    """If the vision LLM fails, the image is skipped even if OCR succeeded.
+
+    The inline block's primary purpose is the visual description; an
+    OCR-only block would be misleading (it'd look like the vision
+    pipeline ran when it didn't), so we treat vision failure as image
+    failure regardless of OCR outcome.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=RuntimeError("vision blew up")),
+    )
+    ocr_runner = AsyncMock(return_value="OCR text")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert result.descriptions == []
+    assert result.failed == 1
+
+
+async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
+    tmp_path, mocker
+):
+    """Backward compat: omitting ocr_runner produces description-only blocks."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Visual"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+# ---------------------------------------------------------------------------
+# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
+# ---------------------------------------------------------------------------
+
+
+def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L  R  10mm"):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+def test_inject_renders_ocr_section_when_ocr_text_present():
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L  R  10mm")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "**OCR text:**" in out
+    assert "L  R  10mm" in out
+    # OCR section comes before the visual description (literal text
+    # first, interpretation second).
+    assert out.index("**OCR text:**") < out.index("**Visual description:**")
+    # Critical: no nested-block constructs (fenced code, blockquote)
+    # that previous formats relied on -- both broke in Streamdown /
+    # PlateJS by escaping their container and dropping content.
+    assert "```" not in out
+    assert "> **" not in out
+
+
+def test_inject_renders_multiline_ocr_with_hard_breaks():
+    """Multi-line OCR uses trailing-two-spaces hard breaks so each
+    line renders on its own row, without needing a fragile fenced
+    code block or blockquote wrapper."""
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
+    )
+
+    out, _ = inject_descriptions_inline(markdown, result)
+
+    # Every OCR line is present.
+    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
+        assert line in out
+    # Non-last OCR lines get the trailing two-space hard break.
+    assert "Slice 24 / 60  \n" in out
+    assert "\nL  \n" in out
+    assert "\nR  \n" in out
+    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
+    assert "10 mm  \n" not in out
+    assert "10 mm\n" in out
+
+
+def test_render_appended_renders_ocr_section_when_ocr_text_present():
+    descriptions = [
+        _desc_with_ocr(
+            name="MM-130-a.jpeg",
+            description="Axial CT.",
+            ocr_text="Slice 24 / 60",
+        ),
+    ]
+    rendered = render_appended_section(descriptions)
+
+    assert "**OCR text:**" in rendered
+    assert "Slice 24 / 60" in rendered
+    assert "Axial CT." in rendered
+
+
+def test_render_omits_ocr_section_when_ocr_text_is_none():
+    descriptions = [_desc(name="Im0", description="A clean radiograph.")]
+    rendered = render_appended_section(descriptions)
+
+    assert "**Embedded image:** `Im0`" in rendered
+    assert "**OCR text:**" not in rendered
+    assert "**Visual description:**" in rendered
+    # No raw HTML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **" not in rendered
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
+#
+# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
+# premium both emit ``<figure>...</figure>`` blocks that already contain
+# the parser's own OCR of the figure (chart bar values, axis labels,
+# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
+# That parser-side content is useful for retrieval on its own, so we
+# PRESERVE the figure verbatim and append our vision-LLM block
+# immediately after rather than substituting for it.
+# ---------------------------------------------------------------------------
+
+
+def test_inject_appends_block_after_figure_preserving_parser_content():
+    """Figure block stays intact; vision-LLM block goes right after it."""
+    markdown = (
+        "Some narrative text.\n\n"
+        "<figure>\n\n"
+        "Republican\n68\nDemocrat\n30\n"
+        "\n</figure>\n\n"
+        "Following paragraph.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Original figure is preserved verbatim -- the parser's OCR'd
+    # numbers must still be searchable.
+    assert "<figure>" in out
+    assert "</figure>" in out
+    assert "Republican" in out and "68" in out
+    # Our vision-LLM block follows the figure, not before / inside it.
+    assert "**Embedded image:** `Im0`" in out
+    assert "Bar chart of party ID." in out
+    figure_close = out.index("</figure>")
+    embedded_at = out.index("**Embedded image:** `Im0`")
+    assert figure_close < embedded_at, "block must be appended AFTER </figure>"
+    # Surrounding narrative is preserved.
+    assert "Some narrative text." in out
+    assert "Following paragraph." in out
+
+
+def test_inject_handles_multiple_figures_in_document_order():
+    """N figures + N descriptions: each pair lands in the right place."""
+    markdown = (
+        "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n"
+        "Between\n\n<figure>\nChart B bars\n</figure>\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
+                description="Description of chart A.",
+            ),
+            PictureDescription(
+                page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
+                description="Description of chart B.",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Both figures preserved; both descriptions inlined; order matches.
+    assert out.count("<figure>") == 2
+    assert out.count("</figure>") == 2
+    assert "Description of chart A." in out
+    assert "Description of chart B." in out
+    assert out.index("Description of chart A.") < out.index(
+        "Description of chart B."
+    )
+    # Each description appears AFTER its corresponding </figure>.
+    first_close = out.index("</figure>")
+    assert first_close < out.index("Description of chart A.")
+    second_close = out.index("</figure>", first_close + 1)
+    assert second_close < out.index("Description of chart B.")
+
+
+def test_inject_figures_with_attributes_and_nested_tags():
+    """``<figure>`` with attributes and nested tags is matched and preserved."""
+    markdown = (
+        '<figure id="fig-3" class="chart">\n'
+        '<figcaption>Source: Pew Research</figcaption>\n'
+        "<table><tr><td>Republican</td><td>57</td></tr></table>\n"
+        "</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Survey table.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # All nested HTML is preserved (chunking will pick it up).
+    assert 'id="fig-3"' in out
+    assert "<figcaption>Source: Pew Research</figcaption>" in out
+    assert "<table>" in out and "Republican" in out and "57" in out
+    # Our block sits after the closing tag.
+    assert out.index("</figure>") < out.index("**Embedded image:** `Im0`")
+
+
+def test_inject_figures_more_descriptions_than_figures_returns_remaining():
+    """Three descriptions, one figure -> one inlined, two left for caller."""
+    markdown = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First desc."),
+            _desc(name="Im1", description="Second desc."),
+            _desc(name="Im2", description="Third desc."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "First desc." in out
+    # Leftovers are the caller's job; inject_descriptions_inline does
+    # not append them on its own.
+    assert "Second desc." not in out
+    assert "Third desc." not in out
+
+
+def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
+    """Two figures, one description -> first figure enriched, second left raw."""
+    markdown = (
+        "<figure>\nfigure 1 content\n</figure>\n"
+        "<figure>\nfigure 2 content\n</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Only description.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Both figures still present; only the first one was enriched.
+    assert out.count("<figure>") == 2
+    assert "Only description." in out
+    # Second figure has no embedded-image block immediately after it.
+    second_open = out.index("<figure>", out.index("<figure>") + 1)
+    second_close = out.index("</figure>", second_open)
+    after_second = out[second_close:]
+    assert "**Embedded image:**" not in after_second
+
+
+def test_merge_inlines_at_figure_boundary():
+    """Top-level helper does the right thing with figures (no leftover section)."""
+    markdown = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    # Inline succeeded -> no appended-section heading.
+    assert "## Image Content" not in out
+    assert "Bar chart." in out
+    assert "<figure>" in out and "</figure>" in out
+
+
+def test_inject_figures_then_falls_through_to_docling_marker():
+    """Mixed-marker doc: figure consumed first, then Docling placeholder.
+
+    Defensive -- single docs are usually one parser's output, but if a
+    pipeline ever stitches two parsers' markdowns together the inliner
+    should still place each description.
+    """
+    markdown = (
+        "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n"
+        "Later in the doc:\n\n"
+        "<!-- image -->\nImage: scan.jpeg\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="Chart description."),
+            _desc(name="Im1", description="Scan description."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Figure preserved + augmented.
+    assert "<figure>" in out and "Chart bars: 50, 40, 30" in out
+    assert "Chart description." in out
+    # Docling placeholder + caption replaced.
+    assert "<!-- image -->" not in out
+    assert "Image: scan.jpeg" not in out
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Scan description." in out
--- a/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
@ -0,0 +1,146 @@
+"""Unit tests for the vision_llm parser helpers.
+
+Two helpers exist:
+
+- :func:`parse_with_vision_llm` -- single-shot for standalone image
+  uploads (.png/.jpg/etc). Returns combined markdown (description +
+  verbatim OCR mixed) since the image *is* the document.
+- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
+  visual description only; OCR is the ETL service's job.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# parse_with_vision_llm: legacy single-shot path
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
+    """Standalone image uploads still go through the combined-markdown path."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "# A scan of something."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+    assert out == "# A scan of something."
+    fake_llm.ainvoke.assert_awaited_once()
+
+
+async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
+    """An empty model response raises rather than silently returning blanks."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = ""
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# parse_image_for_description: per-image-in-PDF, description only
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_image_for_description_returns_description(tmp_path):
+    """Description-only path returns the model's markdown unchanged."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "Axial CT showing a large cystic mass."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_image_for_description(str(img), "scan.png", fake_llm)
+    assert out == "Axial CT showing a large cystic mass."
+
+
+async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
+    """The prompt explicitly tells the model NOT to transcribe text.
+
+    This is the contract that lets us drop OCR from the response: the
+    ETL pipeline already has the text (from page-level OCR), so asking
+    the vision LLM for it would be redundant cost.
+    """
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "A description"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+    # The prompt is the first text part of the message we sent.
+    sent_messages = fake_llm.ainvoke.call_args.args[0]
+    prompt_text = sent_messages[0].content[0]["text"].lower()
+    assert "describe what this image visually depicts" in prompt_text
+    assert "do not transcribe text" in prompt_text
+
+
+async def test_parse_image_for_description_rejects_empty(tmp_path):
+    """Empty response surfaces as ValueError so the caller can skip the image."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "   "  # whitespace-only counts as empty
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# Image size + extension validation (shared by both paths)
+# ---------------------------------------------------------------------------
+
+
+def test_image_to_data_url_rejects_oversized(tmp_path):
+    """Images larger than 5 MB raise before any LLM call is made."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    big = tmp_path / "huge.png"
+    big.write_bytes(b"\x89PNG" + b"\x00" * (6 * 1024 * 1024))
+
+    with pytest.raises(ValueError, match="Image too large"):
+        _image_to_data_url(str(big))
+
+
+def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
+    """Unknown extensions raise rather than guessing a MIME type."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    weird = tmp_path / "scan.xyz"
+    weird.write_bytes(b"\x00" * 100)
+
+    with pytest.raises(ValueError, match="Unsupported image extension"):
+        _image_to_data_url(str(weird))
--- a/surfsense_desktop/package.json
+++ b/surfsense_desktop/package.json
@ -21,7 +21,7 @@
    "email": "rohan@surfsense.com"
  },
  "license": "MIT",
-  "packageManager": "pnpm@10.24.0",
+  "packageManager": "pnpm@10.26.0",
  "devDependencies": {
    "@electron/rebuild": "^4.0.3",
    "@types/node": "^25.5.0",
--- a/surfsense_evals/.env.example
+++ b/surfsense_evals/.env.example
@ -0,0 +1,83 @@
+# surfsense_evals — environment template.
+#
+# Copy this file to `.env` (in the surfsense_evals/ project root or your
+# CWD) and fill in the values. `python-dotenv` loads it automatically
+# the first time `core.config` is imported, so every CLI subcommand
+# (`setup`, `ingest`, `run`, `report`, `teardown`, `models list`, …)
+# will pick the values up.
+#
+#   cp .env.example .env
+#   # then edit .env with your values
+#
+# `.env` is gitignored — never commit real secrets.
+
+# ---------------------------------------------------------------------------
+# 1. Backend target — REQUIRED (default works for a local dev backend)
+# ---------------------------------------------------------------------------
+SURFSENSE_API_BASE=http://localhost:8000
+
+# ---------------------------------------------------------------------------
+# 2. OpenRouter — REQUIRED for any `run` invocation
+# ---------------------------------------------------------------------------
+# The `native_pdf` arm calls OpenRouter directly; the `surfsense` arm
+# routes through SurfSense which uses the same key under the hood.
+OPENROUTER_API_KEY=sk-or-...
+
+# Override only if you proxy OpenRouter through a private gateway:
+# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
+
+# Multimodal benchmarks (medxpertqa, mmlongbench) require a vision-capable
+# slug. Recommended (verify in your catalog with `models list --grep ...`):
+#   anthropic/claude-sonnet-4.5    (default recommendation)
+#   anthropic/claude-opus-4.7      (strongest)
+#   openai/gpt-5                   (top-tier vision)
+#   google/gemini-2.5-pro          (1M-token context, best for long PDFs)
+# DO NOT use openai/gpt-5.4-mini for image-bearing benchmarks — it's
+# text-only on PDF content and the runner emits a warning if pinned.
+
+# ---------------------------------------------------------------------------
+# 3. Auth — pick EXACTLY ONE of the two modes below
+# ---------------------------------------------------------------------------
+
+# --- Mode A: LOCAL (backend started with AUTH_TYPE=LOCAL)
+# The harness POSTs these to /auth/jwt/login automatically.
+# SURFSENSE_USER_EMAIL=you@example.com
+# SURFSENSE_USER_PASSWORD=...
+
+# --- Mode B: GOOGLE OAuth (or any pre-issued JWT)
+# Open the SurfSense web UI in your browser, log in via Google, then in
+# DevTools → Application → Local Storage copy:
+#   surfsense_bearer_token  → SURFSENSE_JWT
+#   surfsense_refresh_token → SURFSENSE_REFRESH_TOKEN  (optional, enables
+#                                                       auto-refresh on 401)
+# SURFSENSE_JWT=eyJhbGciOi...
+# SURFSENSE_REFRESH_TOKEN=eyJhbGciOi...
+
+# ---------------------------------------------------------------------------
+# 4. Filesystem paths — OPTIONAL (defaults below)
+# ---------------------------------------------------------------------------
+# Where datasets, rendered PDFs, ingestion id maps, run outputs, and
+# state.json live. Default: <surfsense_evals>/data/
+# EVAL_DATA_DIR=./data
+
+# Where generated reports (summary.md / summary.json) get written.
+# Default: <surfsense_evals>/reports/
+# EVAL_REPORTS_DIR=./reports
+
+# ---------------------------------------------------------------------------
+# 5. Parser SDKs — REQUIRED for the multimodal_doc / parser_compare suite
+# ---------------------------------------------------------------------------
+# parser_compare calls Azure Document Intelligence and LlamaParse SDKs
+# directly from the eval harness so each (basic / premium) extraction
+# is a clean A/B test independent of the SurfSense backend's ETL routing.
+#
+# Azure Document Intelligence — used for the `azure_basic_lc` (prebuilt-read)
+# and `azure_premium_lc` (prebuilt-layout) arms. Get an endpoint + key from
+# https://portal.azure.com (Document Intelligence resource, F0 / S0 tier).
+# AZURE_DI_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com/
+# AZURE_DI_KEY=<your-32-char-key>
+#
+# LlamaCloud (LlamaParse) — used for `llamacloud_basic_lc` (parse_page_with_llm)
+# and `llamacloud_premium_lc` (parse_page_with_agent). Get a key from
+# https://cloud.llamaindex.ai/api-key.
+# LLAMA_CLOUD_API_KEY=llx-...
--- a/surfsense_evals/.gitignore
+++ b/surfsense_evals/.gitignore
@ -0,0 +1,35 @@
+# Python bytecode + caches
+__pycache__/
+*.py[cod]
+*.pyo
+
+# Editable-install / build artifacts
+*.egg-info/
+build/
+dist/
+.eggs/
+
+# Virtual envs (uv venv default + common alternates)
+.venv/
+venv/
+env/
+
+# Tooling caches
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.coverage
+.coverage.*
+htmlcov/
+
+# Local secrets — keep `.env.example` tracked, never the real `.env`.
+.env
+.env.local
+.env.*.local
+!.env.example
+
+# Run / debug logs — these are ephemeral; the structured artifacts in
+# `data/.../<run_id>/` are the citation surface, not these logs.
+*.log
+logs_*.txt
+retry_run.log
--- a/surfsense_evals/README.md
+++ b/surfsense_evals/README.md
@ -0,0 +1,228 @@
+# SurfSense Evals
+
+Domain-agnostic eval harness for SurfSense. Each benchmark is a Python subpackage under `suites/<domain>/<benchmark>/` that self-registers with the CLI; `core/` is the shared infrastructure (HTTP clients, arms, parsers, metrics, report writer, registry). The harness talks to SurfSense over HTTP only — it does **not** import any backend Python module — so it ships in its own venv and never bloats the FastAPI runtime image.
+
+## Benchmarks
+
+| Benchmark                       | Shape                                            | Vision required? | Default ingest             |
+|---------------------------------|--------------------------------------------------|------------------|----------------------------|
+| `medical/medxpertqa` (headline) | Native PDF vs SurfSense head-to-head, MCQ        | yes              | `vision=on, mode=basic`    |
+| `medical/mirage`                | SurfSense single-arm, MCQ                        | no               | `vision=off, mode=basic`   |
+| `medical/cure`                  | SurfSense single-arm retrieval (Recall/MRR/nDCG) | no               | `vision=off, mode=basic`   |
+| `multimodal_doc/mmlongbench`    | Native PDF vs SurfSense head-to-head, open-ended | yes              | `vision=on, mode=basic`    |
+
+Future domains (`legal/`, `finance/`, `code/`, `scientific/`) drop into `suites/` without touching `core/` or the CLI.
+
+## Install + auth
+
+```bash
+uv pip install -e ./surfsense_evals
+cp surfsense_evals/.env.example surfsense_evals/.env
+# Edit .env: SURFSENSE_API_BASE, OPENROUTER_API_KEY, and ONE of:
+#   LOCAL  → SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD
+#   GOOGLE → SURFSENSE_JWT (+ optional SURFSENSE_REFRESH_TOKEN)
+#            (lift both from browser localStorage after a normal Google login)
+```
+
+## Step-by-step: run all four benchmarks
+
+The medical and multimodal_doc suites each get their own SearchSpace and pinned model, so they're independent — run them in any order. Both head-to-head benchmarks (`medxpertqa`, `mmlongbench`) require a **vision-capable** OpenRouter slug; pinning a text-only one (e.g. `openai/gpt-5.4-mini`) causes images to be dropped without an error — the only signal is a warning emitted by the runner.
+
+Recommended vision slugs (use `models list --grep <name>` to confirm one): `anthropic/claude-sonnet-4.5` (balanced cost), `anthropic/claude-opus-4.7` (strongest reasoning), `openai/gpt-5` (top-tier vision), `google/gemini-2.5-pro` (best for long PDFs, 1M-token context).
+
+```bash
+# 0. (optional) discover what's registered
+python -m surfsense_evals suites list
+python -m surfsense_evals benchmarks list
+
+# 1. MEDICAL SUITE — one SearchSpace, three benchmarks
+python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
+
+#  1a. headline head-to-head: Native PDF (vision) vs SurfSense (vision RAG)
+#      Downloads dev+test JSONL + images.zip, renders one PDF per question
+#      (case + table + images + 5 options), uploads with use_vision_llm=True.
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+
+#  1b. MIRAGE — single-arm SurfSense MCQ accuracy
+#      (MMLU-Med / MedQA-US / MedMCQA / PubMedQA / BioASQ)
+python -m surfsense_evals ingest medical mirage
+python -m surfsense_evals run    medical mirage
+
+#  1c. CUREv1 — single-arm SurfSense retrieval (Recall@k / MRR / nDCG@10)
+python -m surfsense_evals ingest medical cure --lang en
+python -m surfsense_evals run    medical cure --lang en
+
+#  1d. write reports/medical/<UTC-ts>/summary.{md,json}
+python -m surfsense_evals report --suite medical
+
+# 2. MULTIMODAL_DOC SUITE — long PDFs with embedded images, charts, tables
+python -m surfsense_evals setup  --suite multimodal_doc --provider-model google/gemini-2.5-pro
+python -m surfsense_evals ingest multimodal_doc mmlongbench           # ~660MB, resumable
+python -m surfsense_evals run    multimodal_doc mmlongbench --concurrency 4
+python -m surfsense_evals report --suite multimodal_doc
+
+# 3. CLEANUP — soft-deletes the SearchSpaces; rendered PDFs stay cached
+python -m surfsense_evals teardown --suite medical
+python -m surfsense_evals teardown --suite multimodal_doc
+```
+
+## Asymmetric scenarios — the "vision-extract once, answer cheap" play
+
+The walkthrough above is `--scenario head-to-head` (default): both arms answer with the same vision-capable slug. SurfSense's actual architectural value-prop is that the **ingestion-time vision LLM and the runtime LLM are completely independent** — you can pay a vision LLM *once*, at ingest, to convert every embedded image into text (per-image OCR **and** semantic description, inlined where the image actually appears in the document — see [What `--use-vision-llm` produces](#what---use-vision-llm-produces) below). Then every query is served by a cheap text-only model that sees that extracted text natively. Two extra scenarios make this explicit:
+
+| `--scenario`       | Native arm answers with                | SurfSense arm answers with     | Question being measured                                                                  |
+|--------------------|----------------------------------------|--------------------------------|------------------------------------------------------------------------------------------|
+| `head-to-head`     | `--provider-model` (vision)            | `--provider-model` (vision)    | Pure RAG quality at parity. (Default.)                                                   |
+| `symmetric-cheap`  | `--provider-model` (cheap, text-only)  | `--provider-model` (same)      | Does pre-extracted image context let a non-vision LLM reason over image-heavy docs?      |
+| `cost-arbitrage`   | `--native-arm-model` (vision)          | `--provider-model` (cheap)     | How close does SurfSense get to a vision-native baseline at a fraction of per-query cost?|
+
+In all three modes the **ingest-time** vision LLM is set on the SearchSpace's `vision_llm_config_id` (auto-picked from the strongest registered global OpenRouter vision config — `claude-sonnet-4.5` > `claude-opus-4.7` > `gpt-5` > `gemini-2.5-pro`, override with `--vision-llm <slug>`). What changes is which slug the *answering* models hit per arm.
+
+### Ingest with vision, evaluate with a non-vision LLM (`symmetric-cheap`)
+
+This is the answer to *"does SurfSense give a non-vision LLM enough context to reason over image-heavy docs?"*. Both arms hit the same cheap text-only slug. The native arm is structurally blind to images (text-only LLM + raw PDFs). The SurfSense arm reads chunks that already contain the per-image OCR and visual descriptions, written there by the vision LLM at ingest time.
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario symmetric-cheap \
+  --provider-model openai/gpt-5.4-mini
+  # vision LLM at ingest = auto-picked (claude-sonnet-4.5 by default)
+  # answer LLM for BOTH arms = openai/gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test  # vision=on by default
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Δ accuracy on image-required MCQs is the headline number; native arm
+# baseline is "what a text-only LLM gets without seeing the images".
+```
+
+### Cheap SurfSense vs vision-native baseline (`cost-arbitrage`)
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario cost-arbitrage \
+  --provider-model openai/gpt-5.4-mini \
+  --native-arm-model anthropic/claude-sonnet-4.5
+  # vision LLM at ingest = auto-picked claude-sonnet-4.5
+  # native arm = sonnet (vision); SurfSense arm = gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Report header reads:
+#   Scenario: cost-arbitrage — native arm answers with `anthropic/claude-sonnet-4.5`
+#   (vision); SurfSense answers with `openai/gpt-5.4-mini` over chunks vision-extracted
+#   at ingest by `anthropic/claude-sonnet-4.5`.
+```
+
+Notes:
+- `cost-arbitrage` requires both `--provider-model` (the cheap SurfSense slug) AND `--native-arm-model <vision slug>`.
+- `--vision-llm <slug>` is optional; if omitted the harness queries `GET /api/v1/global-vision-llm-configs` and auto-picks the strongest registered one. Pass `--no-vision-llm-setup` if you want to keep whatever vision config is already attached to the SearchSpace.
+- The runner's "looks text-only" warning is suppressed (or relabelled as informational) for `symmetric-cheap` so intentional asymmetry doesn't read as a misconfiguration.
+- All four scenario fields (`scenario`, `provider_model`, `native_arm_model`, `vision_provider_model`) are persisted to `state.json` and recorded in `run_artifact.extra` + the report header — no need to retrace what was set.
+
+## Per-benchmark useful flags
+
+`medical/medxpertqa` (`run`):
+- `--split {test,dev,all}` — pick a subset (default `test`)
+- `--task "Diagnosis"` / `--body-system "Cardiovascular"` — slice the report
+- `--require-images` — drop rare rows where every image filename failed to resolve
+- `--n 100` — quick smoke run
+- `--no-mentions` — let SurfSense retrieve unscoped ("did the @-mention matter?")
+
+`multimodal_doc/mmlongbench`:
+- `--max-docs N` (ingest) — cap downloads at the first N unique PDFs
+- `--format {str,int,float,list,none}` (run) — slice by answer format; `none` = the ~22% intentionally unanswerable hallucination probes
+- `--skip-unanswerable` (run) — drop unanswerable questions
+- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
+
+## Ingestion knobs (vision LLM, processing mode, summarize)
+
+The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
+
+| Flag pair                                  | Effect                                                                                  |
+|--------------------------------------------|-----------------------------------------------------------------------------------------|
+| `--use-vision-llm` / `--no-vision-llm`     | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
+| `--processing-mode {basic,premium}`        | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
+| `--should-summarize` / `--no-summarize`    | Generate a per-document summary at ingest.                                              |
+
+The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
+
+> The backend's `ETL_SERVICE` env var (`DOCLING` | `UNSTRUCTURED` | `LLAMACLOUD`) is **not** per-upload. Restart the backend with a different `ETL_SERVICE` and re-ingest to compare ETLs (route through `--processing-mode premium` if your backend uses that mode for the stronger ETL).
+
+### What `--use-vision-llm` produces
+
+When vision is on, the backend's ETL pipeline (`app/etl_pipeline/picture_describer.py`) does, **per embedded image** in the PDF:
+
+1. Extract the raw image bytes via `pypdf` (deduped by sha256, size-capped to match the vision LLM's per-image limit).
+2. **Per-image OCR** — re-feed the image as a standalone upload through the configured ETL service (Docling / Azure DI / LlamaCloud) with `vision_llm=None`, so the ETL's OCR engine extracts the literal text-in-image.
+3. **Visual description** — call the vision LLM on the image with a description-only prompt (it's explicitly told *not* to transcribe text — that's OCR's job). Steps 2 and 3 run in parallel per image.
+4. Splice a horizontal-rule-delimited section **at the image's original position** in the parser markdown (replacing Docling's `<!-- image -->` placeholder + caption, or the bare `Image: <name>` caption a stripped-image parser leaves behind):
+
+   ```markdown
+   ---
+
+   **Embedded image:** `MM-130-a.jpeg`
+
+   **OCR text:**
+   Slice 24 / 60
+   L  R
+
+   **Visual description:**
+
+   - Axial contrast-enhanced CT showing a large cystic mass in the left upper quadrant.
+   - Mass effect on the adjacent stomach; left kidney displaced inferiorly.
+
+   ---
+   ```
+
+This is what makes `--scenario symmetric-cheap` and `--scenario cost-arbitrage` work: a non-vision LLM reading SurfSense's chunks sees the image's text and semantic content as plain markdown, alongside the surrounding case text, in the same retrieved chunk. Without it the cheap LLM would have nothing extra to read.
+
+### A/B testing the same corpus with different settings
+
+SurfSense dedupes uploads by `(filename, search_space_id)` — **not** by content hash and **not** by ingestion settings. Re-uploading the same filename to the same SearchSpace with a different `--use-vision-llm` flag silently skips re-processing. Give each variant its own SearchSpace:
+
+```bash
+# Baseline arm (vision off)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa --no-vision-llm
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals teardown --suite medical
+
+# Vision arm (the benchmark default)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals report   --suite medical
+```
+
+Both runs land in `data/medical/runs/<ts>/medxpertqa/` with their settings recorded; rendered PDFs stay cached under `data/medical/medxpertqa/pdfs/` so the second `ingest` is upload-only.
+
+## Environment variables
+
+- `SURFSENSE_API_BASE` (default `http://localhost:8000`)
+- `OPENROUTER_API_KEY` — required for the `native_pdf` arm and for `models list`
+- One of `SURFSENSE_USER_EMAIL` + `SURFSENSE_USER_PASSWORD` (LOCAL), **or** `SURFSENSE_JWT` (+ optional `SURFSENSE_REFRESH_TOKEN`) for GOOGLE/pre-issued JWT
+- `EVAL_DATA_DIR` (default `<project>/data`) — datasets, rendered PDFs, ingestion id maps, run outputs, `state.json`
+- `EVAL_REPORTS_DIR` (default `<project>/reports`)
+- `OPENROUTER_BASE_URL` (default `https://openrouter.ai/api/v1`) — only if you proxy OpenRouter
+
+## Adding a new domain suite
+
+1. Create `surfsense_evals/src/surfsense_evals/suites/<domain>/<benchmark>/` with `__init__.py`, `ingest.py`, `runner.py`, optional `prompt.py`.
+2. Implement a `Benchmark` subclass (see `core/registry.py`); compose with `core.clients.*`, `core.arms.*`, `core.parse.*`, `core.metrics.*`.
+3. Call `register(MyBenchmark())` at the bottom of `<benchmark>/__init__.py`. Auto-discovery picks it up; `setup --suite <domain>` and `ingest/run <domain> <benchmark>` work immediately.
+
+Each suite gets its own SearchSpace (`eval-<suite>-<UTC-ts>`), `state.json` slot, data dir, reports dir, and pinned LLM. Suites never share a SearchSpace.
+
+## Out of scope (follow-up PRs)
+
+- Docker service for `docker compose run evals run medical medxpertqa`.
+- Multi-model sweeps (one slug per `setup` for now; aggregate reports come later).
+- A long-context-stuffing arm (give the model the same retrieved chunks SurfSense saw).
+- LLM-judge grader for MMLongBench-Doc (paper uses GPT-4 as judge; we ship a deterministic rule-based grader).
+- MedXpertQA-MM accuracy by image modality — dataset doesn't tag modality directly; we slice by `medical_task` and `body_system`.
+- A `--slot <name>` flag that decouples the state-slot key from the benchmark registry's `suite` attribute, so parallel SearchSpaces with different ingestion settings can coexist on the same benchmark without `teardown` between A/B arms.
+
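+The full design rationale is in the plan file `c:/Users/91882/.cursor/plans/medical_rag_evals_(mirage_+_curev1)_e797a324.plan.md`. Note that this is a path on the original author's machine, not a file in this repository.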
--- a/surfsense_evals/data/.gitignore
+++ b/surfsense_evals/data/.gitignore
@ -0,0 +1,22 @@
+# Default: don't track anything under data/ (large benchmarks, PDFs,
+# extracted markdown, ingestion caches, per-run artifacts can balloon).
+*
+
+# Always keep this gitignore file
+!.gitignore
+
+# Whitelist the artifacts the n=171 multimodal_doc / parser_compare blog
+# report (`reports/blog/multimodal_doc_parser_compare_n171_report.md`)
+# cites by path. These files are the verifiable source of the report's numbers.
+#
+# Path components have to be unblocked one level at a time because git
+# does not look into a directory whose parent is ignored.
+!multimodal_doc/
+!multimodal_doc/runs/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
@ -0,0 +1,37 @@
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:15Z", "latency_ms": 20985, "error": null, "raw_text_chars": 696}]}, "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many adults rated Trump's government ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nLooking through the document, I find relevant information on page 4 (document page 6):\n\nThe document states: \"About four-in-ten Americans rate the ethical standards of Trump administration officials as excellent (9%) or good (30%), while 58% say they are not good (21%) or poor (36%).\"\n\nThe survey was conducted among 1,503 adults (as stated in the methodology section on page 14).\n\nTo find the number of adults who rated the ethical standards as \"poor\":\n36% of 1,503 = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 33894, "output_tokens": 191, "cost_micros": 104547, "latency_ms": 20968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:16Z", "latency_ms": 20453, "error": null, "raw_text_chars": 544}]}, "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at page 5 of the document, which shows \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the data for \"very confident\" and \"not at all confident\" percentages across different domains.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 30%\n\nFor \"Not at all confident\" (highest percentage):\n- Make wise decisions about immigration policy: 55%\n\nAnswer: Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 33942, "output_tokens": 117, "cost_micros": 103581, "latency_ms": 20438, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 41750, "error": null, "raw_text_chars": 1479}]}, "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I need to count the pictures in the document that contain only one person.\n\nLet me go through the document page by page:\n\nPage 1:\n- Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\nPage 2:\n- Photo of Capt. John W. Sanders - 1 person\n- Photo of Capt. Steven M. Wechsler - 1 person\n\nPage 3:\n- Photo of Afghan trainees in laboratory - multiple people\n\nPage 4:\n- Group photo of Pacific Partnership 2012 Internal Medicine Department staff - multiple people\n- Photo of Cmdr. Charmagne Beckett on flight deck - 1 person\n\nPage 5:\n- Photo of Petty Officer 1st Class Dennis Gonzales - 1 person\n\nPage 6:\n- Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n\nPage 7:\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Chris Martinez, Capt. Buhari Oyofo, Col. Vernon Graham, and Lt. Joseph Diclaro - 5 people\n\nPage 8:\n- Group photo of Kazakh scientists and NMRC staff - multiple people\n\nPage 9:\n- Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n- Photo of U.S. Marines and Sailors in aircraft - multiple people\n\nPage 10:\n- Group photo of Joint Combat Casualty Research Team - multiple people\n\nPage 12:\n- Photo of NMRC 2012 Dining Out Event - multiple people\n- Photo with Rear Adm. 
Bruce Doll, Capt. John Sanders and others - multiple people\n\nPage 13:\n- Photo with Ellen Flannery, Emmanuel Adu-Gyamfi, Eil", "citations": [], "input_tokens": 40387, "output_tokens": 512, "cost_micros": 128841, "latency_ms": 41735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "original_error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 36889, "error": null, "raw_text_chars": 1214}]}, "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure, specifically the number of multi-head self-attention layers.\n\nLet me search through the document for information about DETR's architecture.\n\nFrom page 2, I can see that DETR has a transformer encoder-decoder architecture.\n\nOn page 10 (Section 4.1), I find: \"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nLooking at the detailed architecture description on page 22 (Section A.3) and Figure 10, I can see that:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing multi-head self-attention AND multi-head attention (cross-attention)\n\nFrom page 9-10, the baseline model uses \"6 encoder, 6 decoder layers\"\n\nEach encoder layer has one multi-head self-attention layer.\nEach decoder layer has one multi-head self-attention layer and one multi-head cross-attention layer.\n\nSo the total number of multi-head self-attention layers (not counting cross-attention) is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 59866, "output_tokens": 318, "cost_micros": 184368, "latency_ms": 36875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", 
"finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "arm": "llamacloud_basic_lc", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:13Z", "latency_ms": 11328, "error": null, "raw_text_chars": 1376}]}, "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots.\n\nLooking through the figures in the document:\n\n1. **Figure 1**: Shows example images with text descriptions - no bar plots\n2. **Figure 2**: Shows data format comparison with images and text - no bar plots\n3. **Figure 3**: Shows the Syphus overview diagram - no bar plots\n4. **Figure 4**: This appears to show statistics and analysis. Let me examine it carefully:\n   - Part (a): Shows verb-noun structure for instructions - this appears to be a bar chart\n   - Part (b): Shows verb-noun structure for responses - this appears to be a bar chart\n   - Part (c): Shows \"Diversity in our dataset\" with what appears to be distribution plots\n\nLooking at Figure 4 more carefully:\n- (a) contains what looks like a horizontal bar chart showing verbs and nouns from instructions\n- (b) contains what looks like a horizontal bar chart showing verbs and nouns from responses\n- (c) contains multiple distribution plots showing lengths and counts\n\n5. **Figure 5**: Shows response examples - no bar plots\n6. **Figure 6**: The document mentions Fig. 6(b) and Fig. 
6(c) but these are not fully visible in the provided pages\n\nFrom what I can see clearly in Figure 4:\n- Subfigure (a): 1 bar plot (showing instruction verb-noun structure)\n- Subfigure (b): 1 bar plot (showing response verb-noun structure)\n\nTotal visible bar plots: 2\n\nAnswer: 2", "citations": [], "input_tokens": 20240, "output_tokens": 354, "cost_micros": 66030, "latency_ms": 11328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:18Z", "latency_ms": 52514, "error": null, "raw_text_chars": 860}]}, "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll examine pages 5-10 to count the tables.\n\nPage 5: No tables visible in the text content shown.\n\nPage 6: Table 1 is present - \"Llama 2 family of models\" showing training data, parameters, context length, GQA, tokens, and LR for Llama 1 and Llama 2 models.\n\nPage 7: \n- Table 2: \"CO2 emissions during pretraining\" showing time, power consumption, and carbon emitted for different Llama 2 models.\n- Table 3: \"Overall performance on grouped academic benchmarks\" comparing MPT, Falcon, Llama 1, and Llama 2 models.\n\nPage 8:\n- Table 4: \"Comparison to closed-source models on academic benchmarks\" showing results for GPT-3.5, GPT-4, PaLM, PaLM-2-L, and Llama 2.\n- Table 5: \"SFT annotation\" showing examples of helpfulness and safety annotations.\n\nPage 9: No complete tables visible in the shown content.\n\nPage 10: No tables visible in the shown content.\n\nAnswer: 5", "citations": [], "input_tokens": 199213, "output_tokens": 280, "cost_micros": 601839, "latency_ms": 52468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:25Z", "latency_ms": 40452, "error": null, "raw_text_chars": 575}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 4, which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the bar chart showing \"Violation % (lower is safer)\":\n- Llama 2-70b-chat has the lowest violation percentage (approximately 0-5%)\n- Llama 2-13b-chat has the second lowest (approximately 0-5%)\n- Llama 2-34b-chat has the third lowest (approximately 5-10%)\n\nAnswer: Llama 2-70b-chat, Llama 2-13b-chat, Llama 2-34b-chat", "citations": [], "input_tokens": 199224, "output_tokens": 182, "cost_micros": 600402, "latency_ms": 40406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:05Z", "latency_ms": 10343, "error": null, "raw_text_chars": 376}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I can see the violation percentages (lower is safer):\n\nThe top-3 safest models (lowest violation %) are:\n1. Llama-2 7b-chat: 3%\n2. Llama-2 13b-chat: 3%\n3. Llama-2 70b-chat: 4%\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 70b-chat", "citations": [], "input_tokens": 105645, "output_tokens": 151, "cost_micros": 319200, "latency_ms": 10343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 70b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:10Z", "latency_ms": 29391, "error": null, "raw_text_chars": 525}]}, "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the QUAC (f1) 0-shot scores are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 105649, "output_tokens": 274, "cost_micros": 321057, "latency_ms": 29391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:15Z", "latency_ms": 43610, "error": null, "raw_text_chars": 650}]}, "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 9 and count the emojis in the left and right subfigures.\n\nLooking through the document, I find Figure 9 on page 16 (document page 16). The figure shows \"Issues with multi-turn memory (left) can be improved with GAtt (right).\"\n\nLooking at the left subfigure:\n- It shows a conversation with text-only responses (no emojis visible)\n\nLooking at the right subfigure:\n- First response: 4 emojis (\ud83d\udc4b\ud83d\udc41\ufe0f\ud83d\udc44\ud83d\udc41\ufe0f)\n- Second response: 4 emojis (\u2708\ufe0f\u2708\ufe0f\ud83d\udeeb\ud83d\udcba)\n- Third response: 4 emojis (\ud83e\uddb4\ud83e\udd95\ud83e\uddb4\ud83d\udd25)\n- Fourth response: 2 emojis (\ud83c\uddee\ud83c\uddf9\ud83c\udf5d\ud83d\udc90)\n\nTotal in right subfigure: 4 + 4 + 4 + 2 = 14 emojis\n\nTotal in left subfigure: 0 emojis\n\nDifference: 14 - 0 = 14\n\nAnswer: 14", "citations": [], "input_tokens": 199223, "output_tokens": 267, "cost_micros": 601674, "latency_ms": 43578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "original_error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:40Z", "latency_ms": 43593, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:24Z", "latency_ms": 44391, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:10Z", "latency_ms": 35797, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:54:48Z", "latency_ms": 42266, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:55:41Z", "latency_ms": 31983, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:59Z", "latency_ms": 33250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:33Z", "latency_ms": 40889, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:16Z", "latency_ms": 51515, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:55:10Z", "latency_ms": 54500, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:56:14Z", "latency_ms": 32641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:13Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:56:56Z", "latency_ms": 52202, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:57:50Z", "latency_ms": 40469, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:58:34Z", "latency_ms": 49766, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:59:30Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "azure_premium_lc", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:47Z", "latency_ms": 24578, "error": null, "raw_text_chars": 640}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in the screenshot.\n\nLooking through the document, I can find Figure 111 which is part of Section 9.8 on GUI Navigation, specifically about Online Shopping for an Ergonomic Keyboard.\n\nIn Figure 111, the screenshot shows a shopping cart with a keyboard item. The text indicates:\n\n\"Subtotal $4999\"\n\nAnd in the cart details, it shows:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...\n$4999\"\n\nThe price shown is $49.99 (represented as $4999 in the format used in the screenshot, which appears to be missing the decimal point).\n\nAnswer: 49.99", "citations": [], "input_tokens": 133648, "output_tokens": 176, "cost_micros": 403584, "latency_ms": 24578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:57:11Z", "latency_ms": 69062, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:58:21Z", "latency_ms": 35639, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:58:59Z", "latency_ms": 51312, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:59:54Z", "latency_ms": 49218, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:00:48Z", "latency_ms": 56375, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "azure_basic_lc", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:12Z", "latency_ms": 7967, "error": null, "raw_text_chars": 594}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and identify the battery percentage shown in that screenshot.\n\nLooking through the document for Figure 107, I can see it's in Section 9.8 on GUI Navigation, specifically related to online shopping.\n\nIn the image caption list, Figure 107 is described as \"Section 9.8: online shopping.\"\n\nLooking at the actual Figure 107 in the document, I can see it's a smartphone screenshot showing an Amazon shopping interface. At the top of the screenshot, there is a time display \"5:32\" and next to it shows \"75\" which represents the battery percentage.\n\nAnswer: 75", "citations": [], "input_tokens": 117429, "output_tokens": 143, "cost_micros": 354432, "latency_ms": 7967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:20Z", "latency_ms": 33344, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:00:54Z", "latency_ms": 43733, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:01:39Z", "latency_ms": 33390, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:02:16Z", "latency_ms": 33641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:02:59Z", "latency_ms": 38000, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:01:45Z", "latency_ms": 35218, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:02:21Z", "latency_ms": 44921, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:03:08Z", "latency_ms": 52842, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:04:05Z", "latency_ms": 27250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:04:41Z", "latency_ms": 48031, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:37Z", "latency_ms": 20843, "error": null, "raw_text_chars": 345}]}, "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 2, the task formulation section discusses the \"Setting of Conscious Incompetence.\"\n\nThe relevant text states: \"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 60205, "output_tokens": 80, "cost_micros": 181815, "latency_ms": 20828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:57Z", "latency_ms": 14639, "error": null, "raw_text_chars": 1219}]}, "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to identify which modules use Large Language Models (LLMs) as their backbone.\n\nFrom Section 3, the QACHECK system has 5 modules:\n1. **Claim Verifier** - Section 3.1 states it's \"based on InstructGPT\" (an LLM)\n2. **Question Generator** - Section 3.2 states \"Similar to the claim verifier, we also leverage InstructGPT\" (an LLM)\n3. **Question Answering Model** - Section 3.3 describes three implementations, with the default being \"GPT Reciter-Reader\" which uses InstructGPT (an LLM). However, this module also has non-LLM options (Retriever-Reader, FLAN-T5).\n4. **QA Validator** - Section 3.4 states \"We again implement the validator by prompting InstructGPT\" (an LLM)\n5. 
**Reasoner** - Section 3.5 provides \"two different implementations: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\"\n\nCounting the modules that use LLM backbones:\n- Claim Verifier: Yes (InstructGPT)\n- Question Generator: Yes (InstructGPT)\n- Question Answering Model: Yes (uses GPT Reciter-Reader by default)\n- QA Validator: Yes (InstructGPT)\n- Reasoner: Yes (InstructGPT is one of the two implementations)\n\nAll 5 modules have LLM implementations, with some offering alternatives.\n\nAnswer: 5", "citations": [], "input_tokens": 27149, "output_tokens": 403, "cost_micros": 87492, "latency_ms": 14639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:12Z", "latency_ms": 6562, "error": null, "raw_text_chars": 406}]}, "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the sociology subject in MMMU.\n\nIn Figure 3, which shows the distribution of subjects across disciplines, I can see:\n\nUnder \"Humanities & Social Sci. (9%)\":\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- **Sociology (287, 2.48%)**\n\nThe number 287 represents the number of samples in the sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 103232, "output_tokens": 123, "cost_micros": 311541, "latency_ms": 6562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:19Z", "latency_ms": 62062, "error": null, "raw_text_chars": 760}]}, "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Economics-related error cases that fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\nFrom the Case Study section (pages 20-21), I can see Table 10 which shows the index of case study figures by subjects with associated error categories.\n\nLooking at the Economics row in Table 10:\n- Economics has: Correct Case (22), Perceptual Error (23, 24)\n\nThis indicates that there are 2 Economics-related error cases in the Perceptual Error category (Figures 23 and 24).\n\nLet me verify by checking the actual figures:\n- Figure 23 (page 37): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n- Figure 24 (page 38): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n\nAnswer: 2", "citations": [], "input_tokens": 259127, "output_tokens": 212, "cost_micros": 1559532, "latency_ms": 62014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_basic_lc", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:21Z", "latency_ms": 5093, "error": null, "raw_text_chars": 174}]}, "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists the subfields of each subject:\n\nFor Energy & Power, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 83153, "output_tokens": 47, "cost_micros": 250164, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:26Z", "latency_ms": 64485, "error": null, "raw_text_chars": 1513}]}, "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 1 in the document and count the different icons shown as different image types.\n\nLooking at page 1, Figure 1 is titled \"Overview of the MMMU dataset\" and shows four main sections:\n\n1. **Comprehensive Disciplines** - Shows icons/images representing different disciplines (Engineering, Science, Art & Design, Business, Humanities & Social Sci., Medicine)\n\n2. **Heterogeneous Image Types** - Shows various image type icons including:\n   - Diagrams\n   - Tables\n   - Plots and Charts\n   - Photographs\n   - Chemical Structures\n   - Paintings\n   - Medical Images\n   - Sheet Music\n   - Geometric\n   - Pathology images\n   - Microscopic Images\n   - Comics\n\n3. **Interleaved Text and Images** - Shows a medical question example with 3 MRI images\n\n4. **Expert-level Skills Test** - Shows a diagram with three components: Perception, Knowledge, and Reasoning\n\nThe question asks specifically about \"different icon are shown as different image types\" in Figure 1. Looking at the \"Heterogeneous Image Types\" section, I can count the distinct image type icons shown.\n\nCounting the image types listed in the \"Heterogeneous Image Types\" section:\nDiagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, and there's an ellipsis (...) 
suggesting more types.\n\nBased on what's explicitly visible and labeled in Figure 1, I can count approximately 12 distinct image type representations shown.\n\nAnswer: 12", "citations": [], "input_tokens": 259123, "output_tokens": 378, "cost_micros": 1563243, "latency_ms": 64406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:29Z", "latency_ms": 75843, "error": null, "raw_text_chars": 831}]}, "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts that appear in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Figure 4: A comparison chart showing benchmarks plotted on a 2D graph with \"Breadth (Knowledge)\" on x-axis and \"Depth (Reasoning)\" on y-axis\n- A table comparing different datasets\nTotal: 1 chart\n\n**Page 6:**\n- Table 2: Results table (not a chart)\nTotal: 0 charts\n\n**Page 7:**\n- Figure 5: Bar chart showing \"Performance of models on different types of images\"\n- Table 3: Results table (not a chart)\nTotal: 1 chart\n\n**Page 8:**\n- Figure 6: Pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\nTotal: 1 chart\n\n**Page 9:**\n- No charts, only text\nTotal: 0 charts\n\n**Page 10:**\n- No charts, only text (references section)\nTotal: 0 charts\n\nTotal charts in pages 5-10: 1 + 0 + 1 + 1 + 0 + 0 = 3\n\nAnswer: 3", "citations": [], "input_tokens": 259120, "output_tokens": 289, "cost_micros": 1561222, "latency_ms": 75796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:30Z", "latency_ms": 77217, "error": null, "raw_text_chars": 404}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 on page 8 of the document, which shows the \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe error categories and their percentages are:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 259144, "output_tokens": 133, "cost_micros": 1557856, "latency_ms": 77171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "azure_premium_lc", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:45Z", "latency_ms": 7750, "error": null, "raw_text_chars": 387}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe pie chart shows the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 109199, "output_tokens": 125, "cost_micros": 329472, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:53Z", "latency_ms": 59125, "error": null, "raw_text_chars": 830}]}, "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 for error types mentioned in the case studies.\n\nFrom page 95 (Figure 80 - Architecture and Engineering: Surveying and Mapping):\n- This is a correct case, no error type mentioned\n\nFrom page 96 (Figure 82 - Architecture and Engineering: Civil Engineering):\n- Error Category: Reasoning Error\n\nFrom page 97 (Figure 83 - Computer Science: Data Structure and Algorithm):\n- This is a correct case, no error type mentioned\n\nFrom page 98 (Figure 84 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error, Lack of Knowledge\n\nFrom page 99 (Figure 85 - Computer Science: Operating System):\n- Error Category: Perceptual Error\n\nFrom page 100 (Figure 86 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 259136, "output_tokens": 226, "cost_micros": 1559901, "latency_ms": 59077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:48Z", "latency_ms": 11110, "error": null, "raw_text_chars": 1296}]}, "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (correct predictions by GPT-4V) in Pages 65-70.\n\nLet me go through each figure in this range:\n\n**Page 65 - Figure 51 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (E) Familial polyposis coli\n- Ground Truth: (E) Familial polyposis coli\n- **SUCCESS**\n\n**Page 66 - Figure 52 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (D) Ruptured berry aneurysm\n- Ground Truth: (D) Ruptured berry aneurysm\n- **SUCCESS**\n\n**Page 67 - Figure 53 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 68 - Figure 54 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 69 - Figure 55 (Clinical Medicine: Heart Disease)**\n- GPT-4V answered (B) Mitral regurgitation\n- Ground Truth: (C) Aortic value regurgitation\n- Error Category: Lack of Knowledge\n- **FAILURE**\n\n**Page 70 - Figure 56 (Diagnostics and Laboratory Medicine: Radiology)**\n- GPT-4V correctly identified (C) Dandy-Walker syndrome\n- Ground Truth: (C) Dandy-Walker syndrome\n- **SUCCESS**\n\nTotal successful cases: 3\n\nAnswer: 3", "citations": [], "input_tokens": 103232, "output_tokens": 424, "cost_micros": 316056, "latency_ms": 11110, 
"error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:52Z", "latency_ms": 23593, "error": null, "raw_text_chars": 1091}]}, "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the figure with a tree shape and identify which branch has the least leaves.\n\nLooking through the document, I can see Figure 1 on page 2, which shows a \"Technology tree of RAG research\" with a tree-like structure.\n\nExamining this tree diagram, I can see it has a trunk labeled \"Retrieval-Augmented Generation\" and branches extending upward. The main branches appear to be:\n\n1. A left branch (colored in teal/green) showing the evolution from pre-training through fine-tuning stages\n2. A middle branch (colored in orange) labeled \"Instructlens\" \n3. 
A right branch (colored in blue) showing the inference stage with multiple methods\n\nLooking at the number of \"leaves\" (individual methods/papers) on each branch:\n- The left branch (Pre-training/Fine-tuning) has several leaves\n- The middle branch (Instructlens) appears to have the fewest leaves - it's relatively short with minimal branching\n- The right branch (Inference) has many leaves with extensive branching\n\nThe branch labeled \"Instructlens\" in the middle appears to have the least number of leaves.\n\nAnswer: Instructlens", "citations": [], "input_tokens": 64633, "output_tokens": 269, "cost_micros": 197934, "latency_ms": 23577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "instructlens", "normalised_gold": "pre training"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "arm": "azure_premium_lc", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:59Z", "latency_ms": 7156, "error": null, "raw_text_chars": 752}]}, "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Appendix F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 Tree Traversal Algorithm:**\n- Lines: function declaration, initialization of Scurrent, for layer loop, initialization of topk, for node loop, score calculation, append to top_k, end for, Slayer assignment, Scurrent update, end for, return statement, end function\n- Total: 13 lines\n\n**Algorithm 2 Collapsed Tree Algorithm:**\n- Lines: function declaration, flatten tree, initialization of top_nodes, for node loop, append with dot product, end for, sort top_nodes, result initialization, total_tokens initialization, for node loop, if condition, result append, end if, total_tokens update, end for, return statement, end function\n- Total: 16 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 27211, "output_tokens": 190, "cost_micros": 84483, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:06Z", "latency_ms": 16641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:23Z", "latency_ms": 11860, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:36Z", "latency_ms": 12422, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:08:54Z", "latency_ms": 19625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned 
error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:25Z", "latency_ms": 14078, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:16Z", "latency_ms": 13922, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:30Z", "latency_ms": 25952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:57Z", "latency_ms": 14718, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:09:14Z", "latency_ms": 33452, "error": "HTTPStatusError: OpenRouter HTTP 400: 
{\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:54Z", "latency_ms": 22093, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:09:39Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:09:52Z", "latency_ms": 15921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:10Z", "latency_ms": 21921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:10:34Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: 
{\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:10:53Z", "latency_ms": 13686, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:10:16Z", "latency_ms": 14281, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:10:32Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:45Z", "latency_ms": 11952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:01Z", "latency_ms": 20641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n  
  \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:11:30Z", "latency_ms": 13985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:06Z", "latency_ms": 17157, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"{\\\"message\\\":\\\"Input is too long.\\\"}\",\"provider_name\":\"Amazon Bedrock\",\"is_byok\":false}},\"user_id\":\"user_3CNdnY1vL3Ln9TYRiGAii5kmBvu\"}", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:11:25Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:11:40Z", "latency_ms": 13734, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:59Z", "latency_ms": 15828, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB 
limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:25Z", "latency_ms": 19985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "original_error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:44Z", "latency_ms": 15875, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:12:00Z", "latency_ms": 16625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:12:19Z", "latency_ms": 12937, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB 
limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:12:35Z", "latency_ms": 14625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:57Z", "latency_ms": 12156, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
@ -0,0 +1,100 @@
+{
+  "config": {
+    "base_delay": 1.0,
+    "concurrency": 2,
+    "llm_model": "anthropic/claude-sonnet-4.5",
+    "max_attempts": 5,
+    "max_delay": 30.0,
+    "max_output_tokens": 512,
+    "pdf_engine": "native"
+  },
+  "elapsed_s": 1373.6,
+  "n_failed_rows_input": 37,
+  "n_retried": 37,
+  "per_arm": {
+    "azure_basic_lc": {
+      "attempts_distribution": [
+        1
+      ],
+      "recovered": 1,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 1
+    },
+    "azure_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1
+      ],
+      "recovered": 3,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 3
+    },
+    "llamacloud_basic_lc": {
+      "attempts_distribution": [
+        1,
+        1
+      ],
+      "recovered": 2,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 2
+    },
+    "llamacloud_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1
+      ],
+      "recovered": 4,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 4
+    },
+    "native_pdf": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5
+      ],
+      "recovered": 15,
+      "recovery_rate": 0.5555555555555556,
+      "still_failed": 12,
+      "tried": 27
+    }
+  },
+  "raw_retries_path": "data\\multimodal_doc\\runs\\2026-05-14T00-53-19Z\\parser_compare\\raw_retries.jsonl",
+  "run_id": "2026-05-14T00-53-19Z",
+  "totals": {
+    "recovered": 25,
+    "still_failed": 12,
+    "tried": 37
+  }
+}
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
--- a/surfsense_evals/pyproject.toml
+++ b/surfsense_evals/pyproject.toml
@ -0,0 +1,63 @@
+[project]
+name = "surfsense-evals"
+version = "0.1.0"
+description = "Domain-agnostic evaluation harness for SurfSense (medical RAG suite ships first; legal/finance/code suites slot in under suites/)."
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "Apache-2.0" }
+authors = [{ name = "SurfSense" }]
+
+dependencies = [
+    "httpx>=0.27.0",
+    "httpx-sse>=0.4.0",
+    "datasets>=2.21.0",
+    "huggingface_hub>=0.24.0",
+    "reportlab>=4.0.0",
+    "Pillow>=10.0.0",
+    "pyarrow>=15.0.0",
+    "pydantic>=2.6.0",
+    "tqdm>=4.66.0",
+    "numpy>=1.26.0",
+    "scikit-learn>=1.4.0",
+    "scipy>=1.12.0",
+    "python-dotenv>=1.0.0",
+    "rich>=13.7.0",
+    "trafilatura>=1.12.0",
+    "pypdf>=5.1.0",
+    "azure-ai-documentintelligence>=1.0.2",
+    "llama-cloud-services>=0.6.25",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.23.0",
+    "respx>=0.21.0",
+    "ruff>=0.5.0",
+]
+
+[project.scripts]
+surfsense-evals = "surfsense_evals.core.cli:main"
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["surfsense_evals*"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+markers = [
+    "integration: opt-in tests that hit a live SurfSense instance (run with `-m integration`)",
+]
+
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM", "ASYNC"]
+ignore = ["E501"]
--- a/surfsense_evals/reports/.gitignore
+++ b/surfsense_evals/reports/.gitignore
@ -0,0 +1,13 @@
+# Default: don't track auto-generated `summary.md` / `summary.json` from
+# every benchmark run — those are derivative of `data/.../runs/<id>/`.
+*
+!.gitignore
+
+# Hand-curated sample report kept as a reference for the medical suite.
+!medical/
+!medical/sample_summary.md
+
+# Hand-curated blog-ready writeups (one per experiment) — these *are*
+# the public citation surface and must travel with the repo.
+!blog/
+!blog/*.md
--- a/surfsense_evals/reports/blog/multimodal_doc_parser_compare_n171_report.md
+++ b/surfsense_evals/reports/blog/multimodal_doc_parser_compare_n171_report.md
--- a/surfsense_evals/scripts/analyze_failure_timing.py
+++ b/surfsense_evals/scripts/analyze_failure_timing.py
@ -0,0 +1,125 @@
+"""Were the SSL failures clustered in time (network blip) or evenly
+distributed (sustained limit)? Group failures by 1-min buckets using
+the run start time and the per-row latency_ms / answer order.
+
+Also: for the one *real* intrinsic failure — the 30MB request-size limit
+(reported with provider_name "Google" via OpenRouter) on
+2405.09818v1.pdf::Q007 — print the full error message + raw payload
+sizes so the blog has a clean root cause.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+# All paths are anchored to this file's location rather than the current
+# working directory.
+REPO = Path(__file__).resolve().parents[1]  # directory that contains scripts/
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"  # parsed line-by-line as JSON in main()
+PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"  # joined with row["doc_id"] in main()
+
+
+def _failure_kind(err: str, raw_text: str) -> str:
+    """Bucket one raw.jsonl row by failure mode; ``"ok"`` means a clean row.
+
+    The size-limit test runs before the 502/503 substring test because the
+    size-limit message embeds a byte count (e.g. ``33657603 bytes``) whose
+    digits could otherwise be mistaken for an HTTP status code.
+    """
+    lowered = err.lower()
+    if "SSLError" in err:
+        return "ssl"
+    if "exceeds" in lowered and "limit" in lowered:
+        return "size_limit"
+    if "502" in err or "503" in err:
+        return "5xx"
+    if "JSONDecodeError" in err:
+        return "json_decode"
+    if err:
+        return "other"
+    if not raw_text.strip():
+        # no error string and no answer text
+        return "empty"
+    return "ok"
+
+
+def main() -> None:
+    """Print three failure reports for the parser_compare run stored in ``RAW``.
+
+    1. Per arm: the sequential positions of failed rows, how they group
+       into runs of nearby positions (gap of at most 5), and the count of
+       each failure kind.
+    2. Details for the ``native_pdf`` row of ``2405.09818v1.pdf::Q007``.
+    3. Per PDF: number of failed rows, the arms involved and the kinds.
+    """
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    # 1) SSL clustering: failures by question index per arm
+    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
+    arm_seen_count: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        idx = arm_seen_count[arm]  # position of this row within its arm
+        arm_seen_count[arm] += 1
+        kind = _failure_kind(row.get("error") or "", row.get("raw_text") or "")
+        if kind != "ok":
+            by_arm_idx[arm].append((idx, kind))
+
+    print("=" * 80)
+    print("SSL/network-error indices per arm (each arm processes 171 questions in")
+    print("order; index = sequential position within that arm). Tight clustering")
+    print("in time = transient blip, even spread = sustained limit.")
+    print("=" * 80)
+    for arm in sorted(by_arm_idx):
+        items = by_arm_idx[arm]
+        if not items:
+            continue
+        idxs = sorted({i for i, _ in items})
+        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
+        # The list above mixes every failure kind, so show the breakdown too.
+        print(f"   kinds: {dict(Counter(k for _, k in items))}")
+        # show clusters
+        cluster_runs = []
+        cur = [idxs[0]]
+        for i in idxs[1:]:
+            if i - cur[-1] <= 5:  # within 5 questions = same time window
+                cur.append(i)
+            else:
+                cluster_runs.append(cur)
+                cur = [i]
+        cluster_runs.append(cur)
+        print(f"   clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")
+
+    # 2) The 30MB intrinsic failure — full details
+    print()
+    print("=" * 80)
+    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
+    print("=" * 80)
+    for row in rows:
+        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
+            err = row.get("error") or ""
+            print(f"  qid: {row['qid']}")
+            print(f"  doc: {row['doc_id']}, pages: {row.get('pages')}")
+            pdf_path = PDFS / row["doc_id"]
+            if pdf_path.exists():
+                size_mb = pdf_path.stat().st_size / (1024 * 1024)
+                print(f"  PDF size on disk: {size_mb:.1f} MB")
+                # base64 inflates ~33%
+                est_b64 = size_mb * 1.33
+                print(f"  estimated base64 wire size: {est_b64:.1f} MB")
+            print(f"  full error: {err[:600]}")
+            break
+    else:
+        # Make a missing row visible instead of printing an empty section.
+        print("  (no matching native_pdf row found in raw.jsonl)")
+
+    # 3) Per-PDF: which PDFs are pathological?
+    print()
+    print("=" * 80)
+    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
+    print("=" * 80)
+    by_pdf: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        kind = _failure_kind(row.get("error") or "", row.get("raw_text") or "")
+        if kind == "ok":
+            continue
+        by_pdf[row["doc_id"]].append({
+            "arm": row["arm"],
+            "qid": row["qid"],
+            "err_kind": kind,
+            "pages": row.get("pages"),
+        })
+    # Most-affected PDFs first; ties broken by document name.
+    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
+        kinds = Counter(i["err_kind"] for i in items)
+        arms = sorted({i["arm"] for i in items})
+        pages = items[0]["pages"]
+        print(f"  {doc}  pages={pages}  failures={len(items)}  arms={arms}")
+        print(f"     kinds: {dict(kinds)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/analyze_failures.py
+++ b/surfsense_evals/scripts/analyze_failures.py
@ -0,0 +1,155 @@
+"""Drill into the parser_compare n=171 raw.jsonl to surface every
+failure, group by arm + PDF, and dump the underlying error strings so
+we can write up a clean failure-mode taxonomy for the blog post.
+
+Outputs (printed to stdout + written to `failures_n171.json`):
+* per-arm failure count and rate
+* per-PDF failure count across all arms (which docs are pathological?)
+* error-string clusters per arm (so we can give human-readable causes)
+* sample failure rows (one per cluster) for the appendix
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any
+
+
+# All paths are anchored to this file's location rather than the current
+# working directory.
+REPO = Path(__file__).resolve().parents[1]  # directory that contains scripts/
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"  # input: parsed line-by-line as JSON in main()
+OUT = REPO / "scripts" / "failures_n171.json"  # output: JSON summary written by main()
+
+
+def _mentions_status(blob: str, *codes: str) -> bool:
+    """Return True if any of ``codes`` occurs in ``blob`` as a standalone number.
+
+    A plain substring test would also fire on digits inside a larger number,
+    such as the byte count in ``The message size (33657503 bytes) exceeds ...``.
+    """
+    return any(re.search(rf"(?<![\d.]){code}(?!\d)", blob) for code in codes)
+
+
+def _classify(error: str | None, raw_text: str) -> str:
+    """Coarse-grained bucket for an error message.
+
+    Args:
+        error: the row's error string, or ``None`` when no error was recorded.
+        raw_text: the model's raw answer text (may be empty).
+
+    Returns:
+        One of ``empty_response``, ``rate_limit``, ``context_overflow``,
+        ``image_decode_failure``, ``pdf_decode_failure``, ``timeout``,
+        ``provider_5xx``, ``missing_extraction``, ``provider_400``,
+        ``ssl_error``, ``size_limit``, ``other_error`` or ``unknown``.
+        Checks run in that order and the first match wins.
+    """
+
+    blob = (error or "").lower()
+    if not blob and not raw_text.strip():
+        return "empty_response"
+    if "rate limit" in blob or _mentions_status(blob, "429"):
+        return "rate_limit"
+    if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
+        return "context_overflow"
+    if "could not process image" in blob or "invalid image" in blob:
+        return "image_decode_failure"
+    if "could not process pdf" in blob or ("invalid_request_error" in blob and "pdf" in blob):
+        return "pdf_decode_failure"
+    if "timeout" in blob or "timed out" in blob:
+        return "timeout"
+    if (
+        "5xx" in blob
+        or "internal server error" in blob
+        or _mentions_status(blob, "502", "503")
+    ):
+        return "provider_5xx"
+    if "filenotfound" in blob:
+        return "missing_extraction"
+    if "badrequest" in blob:
+        return "provider_400"
+    # The next two buckets only refine what used to fall into "other_error".
+    if "sslerror" in blob or "[ssl:" in blob:
+        return "ssl_error"
+    if "exceeds" in blob and "limit" in blob:
+        return "size_limit"
+    if blob:
+        return "other_error"
+    return "unknown"
+
+
+def main() -> None:
+    """Summarise every failed row of the parser_compare run.
+
+    A row counts as a failure when it carries an ``error`` or its
+    ``raw_text`` is blank. Prints per-arm rates, per-arm error clusters,
+    PDFs with two or more failures and every ``native_pdf`` failure, then
+    writes the collected data as JSON to ``OUT``.
+    """
+    records = [
+        json.loads(text)
+        for text in RAW.read_text(encoding="utf-8").splitlines()
+        if text.strip()
+    ]
+
+    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
+    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
+    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
+    n_per_arm: dict[str, int] = defaultdict(int)
+
+    for record in records:
+        arm = record["arm"]
+        n_per_arm[arm] += 1
+        err = record.get("error")
+        raw_text = record.get("raw_text") or ""
+        if not err and raw_text.strip():
+            continue  # clean row: no error and a non-blank answer
+        cluster = _classify(err, raw_text)
+        entry = {
+            "qid": record["qid"],
+            "doc_id": record["doc_id"],
+            "answer_format": record["answer_format"],
+            "gold": record["gold"],
+            "error": err,
+            "cluster": cluster,
+            "raw_text_len": len(raw_text),
+            "pages": record.get("pages"),
+        }
+        by_arm_failures[arm].append(entry)
+        by_pdf_failures[entry["doc_id"]].append({**entry, "arm": arm})
+        error_clusters[arm][cluster].append(entry)
+
+    rule = "=" * 90
+
+    print(rule)
+    print("Per-arm failure count & rate")
+    print(rule)
+    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
+    for arm in sorted(n_per_arm):
+        n_total = n_per_arm[arm]
+        n_failed = len(by_arm_failures[arm])
+        print(f"{arm:<25} {n_total:>4} {n_failed:>5} {n_failed / n_total * 100:>5.1f}%")
+
+    print()
+    print(rule)
+    print("Failure clusters per arm")
+    print(rule)
+    for arm in sorted(error_clusters):
+        print(f"\n{arm}:")
+        for cluster, members in sorted(error_clusters[arm].items()):
+            first = members[0]  # one representative row per cluster
+            print(f"  {cluster:<22} {len(members):>3}")
+            print(f"     example: {first['qid']}  doc={first['doc_id']} pages={first['pages']}")
+            err_short = (first["error"] or "")[:200].replace("\n", " ")
+            print(f"     error: {err_short}")
+
+    print()
+    print(rule)
+    print("Per-PDF failure totals (PDFs with >=2 failures)")
+    print(rule)
+    pdf_counts: Counter = Counter()
+    for pdf, failures in by_pdf_failures.items():
+        pdf_counts[pdf] = len(failures)
+    for pdf, count in pdf_counts.most_common():
+        if count < 2:
+            # most_common() is sorted by count, so nothing later qualifies
+            break
+        failures = by_pdf_failures[pdf]
+        arms_failed = sorted({failure["arm"] for failure in failures})
+        pages = failures[0].get("pages")
+        print(f"  {pdf}  pages={pages}  failures={count}  arms={arms_failed}")
+
+    print()
+    print(rule)
+    print("All native_pdf failures (one row per failure)")
+    print(rule)
+    for entry in by_arm_failures.get("native_pdf", []):
+        print(f"  {entry['qid']}  doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
+        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
+        print(f"    err: {err}")
+
+    per_arm: dict[str, Any] = {}
+    for arm in sorted(n_per_arm):
+        arm_rows = by_arm_failures[arm]
+        per_arm[arm] = {
+            "n": n_per_arm[arm],
+            "failures": len(arm_rows),
+            "rate": len(arm_rows) / n_per_arm[arm],
+            "clusters": {
+                cluster: len(members)
+                for cluster, members in error_clusters[arm].items()
+            },
+            "rows": arm_rows,
+        }
+    per_pdf = {
+        pdf: [dict(failure) for failure in failures]
+        for pdf, failures in by_pdf_failures.items()
+    }
+    summary: dict[str, Any] = {"per_arm": per_arm, "per_pdf": per_pdf}
+    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(f"\nWrote: {OUT}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_extraction_sizes.py
+++ b/surfsense_evals/scripts/check_extraction_sizes.py
@ -0,0 +1,60 @@
+"""Sanity check extraction sizes against Sonnet 4.5's context window.
+
+Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
+markdown is ~4 chars/token, so anything over ~794k chars likely won't
+fit alongside the system + question + 512 max_output_tokens. Print
+warnings for any extraction that's at risk.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]  # directory that contains scripts/
+MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"  # parsed line-by-line as JSON in main()
+
+CHARS_PER_TOKEN = 4  # rough chars-per-token heuristic used for all estimates
+CTX_TOKENS = 200_000  # context window assumed for the model
+PROMPT_OVERHEAD_TOKENS = 1_000  # system + question + format hint
+MAX_OUTPUT_TOKENS = 512  # tokens reserved for the answer
+# Character budget left for an extraction once prompt overhead and output
+# tokens are subtracted from the context window: (200000 - 1000 - 512) * 4.
+SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
+
+
+def main() -> None:
+    """Report the largest extraction per arm and any that exceed ``SAFE_CHARS``.
+
+    Reads the manifest at ``MAP`` (one JSON object per non-blank line) and
+    inspects each row's ``extractions`` mapping of arm name to a dict with a
+    ``chars`` count.
+    """
+    manifest = []
+    for text in MAP.read_text(encoding="utf-8").splitlines():
+        if text.strip():
+            manifest.append(json.loads(text))
+
+    arm_max: dict[str, tuple[int, str]] = {}
+    overflows: list[tuple[str, str, int]] = []
+    for record in manifest:
+        extractions = record.get("extractions") or {}
+        for arm_name, info in extractions.items():
+            n_chars = int(info.get("chars") or 0)
+            best = arm_max.get(arm_name)
+            # strict comparison: the first document seen wins a tie
+            if best is None or best[0] < n_chars:
+                arm_max[arm_name] = (n_chars, record["doc_id"])
+            if n_chars > SAFE_CHARS:
+                overflows.append((record["doc_id"], arm_name, n_chars))
+
+    print(f"PDFs in manifest: {len(manifest)}")
+    print(f"safe char budget: {SAFE_CHARS:,}  (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
+    print()
+    print("largest extraction per arm:")
+    for arm_name, (n_chars, doc_id) in sorted(arm_max.items()):
+        print(f"  {arm_name:25s}  {n_chars:>10,} chars  ({doc_id})")
+
+    print()
+    if not overflows:
+        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
+        return
+    print(f"OVERFLOW RISK ({len(overflows)} extractions > safe budget):")
+    for doc_id, arm_name, n_chars in overflows:
+        est_tokens = n_chars // CHARS_PER_TOKEN
+        print(f"  {doc_id} :: {arm_name} :: {n_chars:,} chars (~{est_tokens:,} tokens)")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_uploaded_status.py
+++ b/surfsense_evals/scripts/check_uploaded_status.py
@ -0,0 +1,77 @@
+"""Query SurfSense for the status of every MMLongBench PDF in scope.
+
+Uses the existing SurfSense documents client to query
+``/documents/status?document_ids=...`` for both the known-existing 5
+PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
+(candidate ids 7577-7624).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+
+import httpx
+from dotenv import load_dotenv
+
+
+REPO = Path(__file__).resolve().parents[1]  # directory that contains scripts/; .env is loaded from here
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"  # globbed for *.pdf in main()
+
+
+async def main() -> None:
+    """Print the SurfSense status of every locally cached PDF.
+
+    Loads ``.env`` from ``REPO``, asks ``/api/v1/documents/status`` about a
+    fixed list of candidate document ids, matches the returned items to the
+    file names in ``PDF_DIR`` by title, and prints one line per PDF followed
+    by a count per state.
+
+    Raises:
+        SystemExit: if ``SURFSENSE_JWT`` is not set.
+        httpx.HTTPStatusError: if the status endpoint returns an error status.
+    """
+    load_dotenv(REPO / ".env")
+    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
+    token = os.environ.get("SURFSENSE_JWT")
+    if not token:
+        raise SystemExit("SURFSENSE_JWT missing from .env")
+
+    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
+    print(f"local cached PDFs: {len(pdf_names)}")
+
+    # Hard-coded id ranges: 5219-5223 and 7577-7624 (range() excludes the end).
+    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
+
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Accept": "application/json",
+    }
+    async with httpx.AsyncClient(timeout=30.0) as http:
+        r = await http.get(
+            f"{base}/api/v1/documents/status",
+            params={
+                "search_space_id": 55,  # hard-coded search space
+                "document_ids": ",".join(str(d) for d in candidate_ids),
+            },
+            headers=headers,
+        )
+        r.raise_for_status()
+        # Tolerate an explicit null as well as a missing key.
+        items = r.json().get("items") or []
+
+    # Index by title; if two items share a title the later one wins.
+    by_title: dict[str, dict] = {}
+    for it in items:
+        status = it.get("status") or {}
+        by_title[it.get("title") or ""] = {
+            "id": it.get("id"),
+            "state": status.get("state"),
+            "reason": status.get("reason"),
+        }
+
+    by_state: dict[str, int] = {}
+    print()
+    for name in pdf_names:
+        info = by_title.get(name)
+        if info is None:
+            print(f"  [missing      ]              {name}")
+            by_state["missing"] = by_state.get("missing", 0) + 1
+        else:
+            tag = info["state"] or "?"
+            # None does not accept a ">5" format spec, so convert to text first.
+            doc_id = "?" if info["id"] is None else str(info["id"])
+            print(f"  [{tag:13s}] doc_id={doc_id:>5}  {name}")
+            by_state[tag] = by_state.get(tag, 0) + 1
+    print()
+    print("summary:")
+    for k, v in sorted(by_state.items()):
+        print(f"  {k}: {v}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/surfsense_evals/scripts/compute_adjusted_accuracy.py
+++ b/surfsense_evals/scripts/compute_adjusted_accuracy.py
@ -0,0 +1,112 @@
+"""Compute "intrinsic" accuracy by removing transient network errors.
+
+A failure is *transient* if it's:
+  * SSLError: SSL bad-record-mac (TLS hiccup)
+  * Cloudflare 502 / 503 (provider-side load shedding)
+  * empty_response with no error string and no other signal (likely
+    connection reset mid-stream)
+  * JSONDecodeError (parse error mid-stream)
+
+A failure is *intrinsic* if it's a hard limit:
+  * "exceeds .* limit" (size limits)
+  * context_length errors
+  * provider 400 with image / pdf decode failure
+  * malformed-input failures
+
+We re-compute accuracy with two denominators:
+  * raw acc       = correct / 171  (what the headline reports)
+  * adjusted acc  = correct / (171 - transient_failures)  (intrinsic)
+
+Outputs a table that we can drop straight into the blog.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+
+
+# Lower-case substrings; ``classify`` lower-cases the error text before matching.
+TRANSIENT_HINTS = (
+    "sslv3_alert_bad_record_mac",
+    "ssl_alert_bad_record_mac",
+    "ssl: ssl",
+    "cloudflare",
+    "error 502",
+    "error 503",
+    "bad gateway",
+    "service unavailable",
+    "gateway timeout",
+    "jsondecodeerror",
+)
+INTRINSIC_HINTS = (
+    "exceeds",
+    "context_length",
+    "context window",
+    "could not process pdf",
+    "could not process image",
+)
+
+
+def classify(error: str | None, raw_text: str) -> str:
+    """Bucket one result row by its failure mode.
+
+    Args:
+        error: the row's error string, or ``None`` / ``""`` when there is none.
+        raw_text: the model's raw answer text.
+
+    Returns:
+        ``"transient_empty"`` when there is neither an error nor any
+        non-whitespace text, ``"ok"`` when there is text and no error,
+        ``"transient_ssl_or_5xx"`` / ``"intrinsic_limit"`` when the lower-cased
+        error contains one of the corresponding hints (transient hints are
+        checked first), and ``"other_error"`` for every remaining error.
+    """
+    message = error.lower() if error else ""
+    if not message:
+        return "ok" if raw_text.strip() else "transient_empty"
+
+    ordered_buckets = (
+        ("transient_ssl_or_5xx", TRANSIENT_HINTS),
+        ("intrinsic_limit", INTRINSIC_HINTS),
+    )
+    for label, hints in ordered_buckets:
+        for hint in hints:
+            if hint in message:
+                return label
+    return "other_error"
+
+
+def main() -> None:
+    """Print raw accuracy next to accuracy with transient failures removed.
+
+    Every non-blank line of ``RAW`` is parsed as one JSON row. Rows are
+    tallied per ``arm`` (row count, correct answers, and one counter per
+    non-"ok" ``classify`` verdict), then one table line per arm is printed in
+    name order, followed by a legend explaining the columns.
+    """
+    rows = [
+        json.loads(text)
+        for text in RAW.read_text(encoding="utf-8").splitlines()
+        if text.strip()
+    ]
+
+    failure_kinds = (
+        "transient_ssl_or_5xx", "transient_empty",
+        "intrinsic_limit", "other_error",
+    )
+    by_arm: dict[str, dict] = {}
+    for row in rows:
+        arm = row["arm"]
+        tally = by_arm.get(arm)
+        if tally is None:
+            # First row of this arm: start every counter at zero.
+            tally = by_arm[arm] = dict.fromkeys(("n", "correct", *failure_kinds), 0)
+        tally["n"] += 1
+        if (row.get("graded") or {}).get("correct"):
+            tally["correct"] += 1
+        verdict = classify(row.get("error"), row.get("raw_text") or "")
+        if verdict != "ok":
+            tally[verdict] += 1
+
+    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
+    print("-" * 88)
+    for arm in sorted(by_arm):
+        tally = by_arm[arm]
+        transient = tally["transient_ssl_or_5xx"] + tally["transient_empty"]
+        intrinsic = tally["intrinsic_limit"]
+        other = tally["other_error"]
+        raw = tally["correct"] / tally["n"] * 100
+        # Adjusted denominator drops the rows lost to transient failures.
+        usable = tally["n"] - transient
+        if usable:
+            adj = tally["correct"] / usable * 100
+        else:
+            adj = 0
+        print(
+            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
+        )
+
+    print()
+    legend = (
+        "transient   = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would",
+        "              succeed on retry; eval harness has no built-in retry today).",
+        "intrinsic   = hard limit (e.g. >30MB Anthropic request, model context overflow).",
+        "adj acc%    = correct / (n - transient) — what the arm scores when network noise",
+        "              is removed; closest thing we have to a like-for-like quality number.",
+    )
+    for text in legend:
+        print(text)
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/compute_blog_extras.py
+++ b/surfsense_evals/scripts/compute_blog_extras.py
@ -0,0 +1,381 @@
+"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
+per-PDF heterogeneity, latency/token distribution percentiles.
+
+Reads the merged post-retry artifact:
+
+    data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
+
+Outputs to stdout:
+
+  1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
+  2) Per-arm input/output token distribution (mean, p50, p95, max).
+  3) McNemar pairwise table: for every unordered (arm_i, arm_j) pair on the
+     same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
+     b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
+     two-sided p-value. We include both raw (using the original raw.jsonl)
+     and post-retry results.
+  4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
+
+Pure stdlib — no scipy/numpy.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import statistics
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+# ---------------------------------------------------------------------------
+# I/O
+# ---------------------------------------------------------------------------
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Parse a UTF-8 JSON Lines file, ignoring whitespace-only lines.
+
+    Returns the decoded values in file order.
+    """
+    with path.open("r", encoding="utf-8") as handle:
+        stripped = (raw.strip() for raw in handle)
+        return [json.loads(text) for text in stripped if text]
+
+
+# ---------------------------------------------------------------------------
+# Distribution helpers
+# ---------------------------------------------------------------------------
+
+
+def _percentile(values: list[float], p: float) -> float:
+    """Return the ``p``-th percentile of ``values`` (``p`` in [0, 100]).
+
+    The fractional rank ``(len - 1) * p / 100`` is interpolated linearly
+    between its two neighbouring order statistics. An empty input yields
+    ``0.0`` and a single value is returned as-is.
+    """
+
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    if len(ordered) == 1:
+        return float(ordered[0])
+
+    rank = (len(ordered) - 1) * (p / 100.0)
+    below = math.floor(rank)
+    above = math.ceil(rank)
+    if below != above:
+        # Rank falls between two samples: interpolate by its fractional part.
+        span = ordered[above] - ordered[below]
+        return float(ordered[below] + span * (rank - below))
+    return float(ordered[int(rank)])
+
+
+# ---------------------------------------------------------------------------
+# McNemar exact-binomial p-value
+# ---------------------------------------------------------------------------
+
+
+def _binom_coef(n: int, k: int) -> int:
+    """Binomial coefficient C(n, k); 0 when ``k`` lies outside ``[0, n]``."""
+    if k < 0 or k > n:
+        return 0
+    return math.comb(n, k)
+
+
+def _mcnemar_exact_pvalue(b: int, c: int) -> float:
+    """Two-sided exact-binomial McNemar p-value.
+
+    Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
+    on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
+    The two-sided p-value is
+
+        P(X <= min(b, c)) + P(X >= max(b, c))
+
+    which, by symmetry of Bin(n, 0.5), equals ``2 * P(X <= min(b, c))``
+    clipped at 1.0.
+
+    Args:
+        b: number of pairs where only the first arm is correct.
+        c: number of pairs where only the second arm is correct.
+
+    Returns:
+        The p-value in ``[0.0, 1.0]``; ``1.0`` when there are no discordant
+        pairs at all.
+    """
+
+    n = b + c
+    if n == 0:
+        return 1.0
+    k = min(b, c)
+    tail = sum(_binom_coef(n, i) for i in range(k + 1))
+    # Divide int by int: Python rounds the exact quotient to a float once,
+    # so huge counts neither overflow (the old ``2.0 * cdf`` float conversion
+    # failed for n >= ~1024) nor lose precision for small ones.
+    p = (2 * tail) / (2 ** n)
+    return min(1.0, p)
+
+
+def _mcnemar_table(rows: list[dict]) -> dict:
+    """Build the pairwise McNemar contingency table for all arms in ``rows``.
+
+    Rows are first folded into ``{qid: {arm: correct}}`` (a later row for the
+    same qid/arm overwrites an earlier one). Each unordered pair of arms is
+    then compared on the questions where both arms have a row.
+
+    Returns:
+        ``{"arms": sorted arm names, "n_qids": number of distinct qids,
+        "pairs": [...]}`` where every pair entry carries the four
+        contingency counts and the exact two-sided ``p_value``.
+    """
+
+    verdicts: dict[str, dict[str, bool]] = {}
+    for rec in rows:
+        qid = rec["qid"]
+        arm = rec["arm"]
+        grade = rec.get("graded") or {}
+        verdicts.setdefault(qid, {})[arm] = bool(grade.get("correct"))
+
+    arms = sorted({arm for answers in verdicts.values() for arm in answers})
+    qids = sorted(verdicts)
+    pairs: list[dict] = []
+    for idx, left in enumerate(arms):
+        for right in arms[idx + 1:]:
+            tally = {"left_only": 0, "right_only": 0, "both": 0, "neither": 0}
+            for qid in qids:
+                answers = verdicts[qid]
+                if not (left in answers and right in answers):
+                    continue
+                left_ok, right_ok = answers[left], answers[right]
+                if left_ok == right_ok:
+                    tally["both" if left_ok else "neither"] += 1
+                else:
+                    tally["left_only" if left_ok else "right_only"] += 1
+            pairs.append({
+                "arm_i": left, "arm_j": right,
+                "b_i_only": tally["left_only"], "c_j_only": tally["right_only"],
+                "both_correct": tally["both"], "both_wrong": tally["neither"],
+                "p_value": _mcnemar_exact_pvalue(
+                    tally["left_only"], tally["right_only"]
+                ),
+            })
+    return {"arms": arms, "n_qids": len(qids), "pairs": pairs}
+
+
+# ---------------------------------------------------------------------------
+# Per-PDF heterogeneity
+# ---------------------------------------------------------------------------
+
+
+def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
+    """Summarise, per arm, how accuracy varies from one PDF to the next.
+
+    A PDF's accuracy is the share of its rows (grouped by ``doc_id``) whose
+    ``graded.correct`` is truthy. For every arm the result holds the number
+    of PDFs, mean / sample std / min / max / quartiles of those accuracies,
+    and how many PDFs scored exactly 0 or exactly 1.
+    """
+
+    per_arm: dict[str, dict[str, list[bool]]] = {}
+    for rec in rows:
+        arm = rec["arm"]
+        pdf = rec["doc_id"]
+        verdict = bool((rec.get("graded") or {}).get("correct"))
+        per_arm.setdefault(arm, {}).setdefault(pdf, []).append(verdict)
+
+    summary: dict[str, dict] = {}
+    for arm, by_pdf in per_arm.items():
+        accs = [sum(hits) / len(hits) for hits in by_pdf.values() if hits]
+        if not accs:
+            continue
+        # Sample standard deviation needs at least two PDFs.
+        spread = statistics.stdev(accs) if len(accs) > 1 else 0.0
+        summary[arm] = {
+            "n_pdfs": len(accs),
+            "mean": statistics.mean(accs),
+            "std": spread,
+            "min": min(accs),
+            "max": max(accs),
+            "p25": _percentile(accs, 25),
+            "p50": _percentile(accs, 50),
+            "p75": _percentile(accs, 75),
+            "n_pdfs_zero": len([a for a in accs if a == 0.0]),
+            "n_pdfs_perfect": len([a for a in accs if a == 1.0]),
+        }
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# Latency / token distributions
+# ---------------------------------------------------------------------------
+
+
+def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
+    """Per-arm latency distribution, converted from milliseconds to seconds.
+
+    Rows whose ``latency_ms`` is missing, ``None`` or ``0`` are ignored. Each
+    arm gets its sample count, mean, sample std, the p10..p99 percentiles,
+    the maximum, and the coefficient of variation.
+    """
+    samples: dict[str, list[float]] = {}
+    for rec in rows:
+        latency = rec.get("latency_ms")
+        if latency is not None and latency != 0:
+            samples.setdefault(rec["arm"], []).append(float(latency))
+
+    report: dict[str, dict] = {}
+    for arm, lats in samples.items():
+        several = len(lats) > 1
+        mean_ms = statistics.mean(lats)
+        std_ms = statistics.stdev(lats) if several else 0.0
+
+        stats: dict = {
+            "n": len(lats),
+            "mean_s": mean_ms / 1000,
+            "std_s": std_ms / 1000,
+        }
+        for pct in (10, 25, 50, 75, 90, 95, 99):
+            stats[f"p{pct}_s"] = _percentile(lats, pct) / 1000
+        stats["max_s"] = max(lats) / 1000
+        # Coefficient of variation: std / mean (unitless tail-fatness).
+        stats["cv"] = std_ms / mean_ms if several and mean_ms > 0 else 0.0
+        report[arm] = stats
+    return report
+
+
+def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
+    """Per-arm input/output token distributions.
+
+    Zero, ``None`` and missing ``input_tokens`` / ``output_tokens`` values
+    are skipped. The result maps each arm (in sorted order) to an entry that
+    has an ``"input"`` and/or ``"output"`` summary (n, mean, p50, p95, max),
+    depending on which side has at least one sample.
+    """
+
+    def _describe(vals: list[float]) -> dict:
+        # One distribution summary; ``vals`` is never empty here.
+        return {
+            "n": len(vals),
+            "mean": statistics.mean(vals),
+            "p50": _percentile(vals, 50),
+            "p95": _percentile(vals, 95),
+            "max": max(vals),
+        }
+
+    sides = ("input", "output")
+    buckets: dict[str, dict[str, list[float]]] = {side: {} for side in sides}
+    for rec in rows:
+        counts = [rec.get(f"{side}_tokens") or 0 for side in sides]
+        for side, count in zip(sides, counts):
+            if count:
+                buckets[side].setdefault(rec["arm"], []).append(float(count))
+
+    report: dict[str, dict] = {}
+    for arm in sorted(set(buckets["input"]) | set(buckets["output"])):
+        entry = {
+            side: _describe(buckets[side][arm])
+            for side in sides
+            if buckets[side].get(arm)
+        }
+        if entry:
+            report[arm] = entry
+    return report
+
+
+# ---------------------------------------------------------------------------
+# Pretty-printing
+# ---------------------------------------------------------------------------
+
+
+def _print_latency(title: str, lat: dict[str, dict]) -> None:
+    """Print the latency table, one arm per line, fastest mean first.
+
+    ``lat`` is the mapping produced by ``_per_arm_latency``; only the mean,
+    std, p50, p90, p95, p99, max and CV columns are shown.
+    """
+    shown = ("mean", "std", "p50", "p90", "p95", "p99", "max")
+    header = (
+        f"{'arm':<25} {'n':>4} "
+        + " ".join(f"{label:>7}" for label in shown)
+        + f" {'CV':>5}"
+    )
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm, stats in sorted(lat.items(), key=lambda kv: kv[1]["mean_s"]):
+        seconds = " ".join(f"{stats[label + '_s']:>6.1f}s" for label in shown)
+        print(f"{arm:<25} {stats['n']:>4} {seconds} {stats['cv']:>5.2f}")
+
+
+def _print_tokens(title: str, toks: dict[str, dict]) -> None:
+    """Print the token table, one arm per line in name order.
+
+    ``toks`` is the mapping produced by ``_per_arm_tokens``. Arms without an
+    ``"input"`` summary are not printed; a missing ``"output"`` summary is
+    shown as zeros.
+    """
+    in_columns = ("mean", "p50", "p95", "max")
+    out_columns = ("mean", "p95")
+    header = (
+        f"{'arm':<25} "
+        + " ".join(f"{'in ' + col:>9}" for col in in_columns)
+        + "  "
+        + " ".join(f"{'out ' + col:>9}" for col in out_columns)
+    )
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(toks):
+        entry = toks[arm]
+        tokens_in = entry.get("input")
+        if not tokens_in:
+            continue
+        tokens_out = entry.get("output") or {}
+        in_cells = " ".join(f"{tokens_in[col]:>9,.0f}" for col in in_columns)
+        out_cells = " ".join(f"{tokens_out.get(col, 0):>9,.0f}" for col in out_columns)
+        print(f"{arm:<25} {in_cells}  {out_cells}")
+
+
+def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
+    """Print the per-PDF accuracy spread, best mean accuracy first.
+
+    ``var`` is the mapping produced by ``_per_pdf_stats``; accuracies are
+    shown as percentages.
+    """
+    pct_columns = ("mean", "std", "min", "p25", "p50", "p75", "max")
+    header = (
+        f"{'arm':<25} {'n_pdfs':>7} "
+        + " ".join(f"{col:>7}" for col in pct_columns)
+        + f" {'#0%':>5} {'#100%':>6}"
+    )
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm, stats in sorted(var.items(), key=lambda kv: -kv[1]["mean"]):
+        percents = " ".join(f"{stats[col]*100:>6.1f}%" for col in pct_columns)
+        print(
+            f"{arm:<25} {stats['n_pdfs']:>7} {percents} "
+            f"{stats['n_pdfs_zero']:>5} {stats['n_pdfs_perfect']:>6}"
+        )
+
+
+def _print_mcnemar(title: str, table: dict) -> None:
+    """Print the pairwise McNemar table, most significant pair first.
+
+    Args:
+        title: heading printed above the table.
+        table: mapping with ``"n_qids"`` and ``"pairs"``; every pair needs
+            ``arm_i``, ``arm_j``, ``b_i_only``, ``c_j_only``,
+            ``both_correct``, ``both_wrong`` and ``p_value``.
+
+    The ``sig`` column shows ``***`` / ``**`` / ``*`` for p-values below
+    0.001 / 0.01 / 0.05 and stays empty otherwise.
+    """
+    print()
+    print(title)
+    print("-" * len(title))
+    # ``n_qids`` counts every question seen for any arm; it does not
+    # guarantee that all arms answered it, so the label must not claim that.
+    print(f"n_qids with a row from at least one arm: {table['n_qids']}")
+    header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
+              f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
+    print(header)
+    print("-" * len(header))
+    for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
+        sig = ""
+        if pair["p_value"] < 0.001:
+            sig = "***"
+        elif pair["p_value"] < 0.01:
+            sig = "**"
+        elif pair["p_value"] < 0.05:
+            sig = "*"
+        print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
+              f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
+              f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
+              f"{pair['p_value']:>13.4f} {sig:>4}")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """Print every blog statistic for one run and return exit code 0.
+
+    ``--run-id`` selects the run directory. Both ``raw.jsonl`` and
+    ``raw_post_retry.jsonl`` must exist there, otherwise the script exits
+    with an explanatory message.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = parser.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    post_path = run_dir / "raw_post_retry.jsonl"
+    if not (raw_path.exists() and post_path.exists()):
+        raise SystemExit(
+            "Missing raw.jsonl or raw_post_retry.jsonl. "
+            "Run scripts/compute_post_retry_accuracy.py first."
+        )
+
+    raw_rows = _read_jsonl(raw_path)
+    post_rows = _read_jsonl(post_path)
+
+    print(f"Run: {args.run_id}")
+    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")
+
+    # Latency uses post-retry rows (post-retry rows include the retry's own
+    # latency for recovered rows). For raw, recovered rows have latency=0
+    # because the harness recorded a failure.
+    sections = (
+        (_print_latency, "Per-arm latency (post-retry)", _per_arm_latency, post_rows),
+        (_print_tokens, "Per-arm token distribution (post-retry)", _per_arm_tokens, post_rows),
+        (_print_pdf_var, "Per-PDF accuracy heterogeneity (post-retry)", _per_pdf_stats, post_rows),
+        (_print_mcnemar, "McNemar pairwise (RAW, no retries)", _mcnemar_table, raw_rows),
+        (_print_mcnemar, "McNemar pairwise (POST-RETRY)", _mcnemar_table, post_rows),
+    )
+    # Each section is computed right before it is printed, in this order.
+    for show, heading, compute, data in sections:
+        show(heading, compute(data))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/compute_post_retry_accuracy.py
+++ b/surfsense_evals/scripts/compute_post_retry_accuracy.py
@ -0,0 +1,180 @@
+"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
+
+Reads:
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
+
+For each (arm, qid) present in the retry artifact:
+  - if the retry RECOVERED, the retry row replaces the original row (same
+    grader is reused — see ``mmlongbench/grader.py``);
+  - if the retry did NOT recover, the original row stays (still a failure,
+    so ``correct=False`` and ``f1=0``).
+
+Prints two tables side by side:
+  * Raw run (no retries) — matches §1 of the blog.
+  * Post-retry run        — final, "what would the headline have been if
+                              the harness had had retries from day one".
+
+It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
+so any downstream notebook / report can join straight on it.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Load a UTF-8 JSON Lines file as a list, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as fh:
+        # strip each line, drop the empty ones, decode the rest in order
+        non_blank = filter(None, map(str.strip, fh))
+        return list(map(json.loads, non_blank))
+
+
+def _row_key(row: dict) -> tuple[str, str]:
+    """Return the ``(arm, qid)`` join key of a row, both coerced to ``str``."""
+    arm, qid = row["arm"], row["qid"]
+    return str(arm), str(qid)
+
+
+def _is_failure(row: dict) -> bool:
+    """True when the row carries an error or its ``raw_text`` is blank/missing."""
+    if row.get("error"):
+        return True
+    if not (row.get("raw_text") or "").strip():
+        return True
+    return False
+
+
+def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
+    """Aggregate accuracy, mean F1 and failure rate for every arm.
+
+    Args:
+        rows_by_arm: result rows grouped by arm name.
+
+    Returns:
+        ``{arm: {"n", "n_correct", "n_failures", "accuracy", "f1_mean",
+        "failure_rate"}}``; the three ratios are ``0.0`` for an arm without
+        rows.
+    """
+    out: dict[str, dict] = {}
+    for arm, rows in rows_by_arm.items():
+        n = len(rows)
+        # ``or {}`` also covers rows that store ``"graded": null`` (e.g. a
+        # failed call that was never graded); ``.get("graded", {})`` would
+        # hand back None there and crash on the chained ``.get``.
+        grades = [r.get("graded") or {} for r in rows]
+        n_correct = sum(1 for g in grades if g.get("correct"))
+        f1_sum = sum(float(g.get("f1") or 0.0) for g in grades)
+        n_fail = sum(1 for r in rows if _is_failure(r))
+        out[arm] = {
+            "n": n,
+            "n_correct": n_correct,
+            "n_failures": n_fail,
+            "accuracy": (n_correct / n) if n else 0.0,
+            "f1_mean": (f1_sum / n) if n else 0.0,
+            "failure_rate": (n_fail / n) if n else 0.0,
+        }
+    return out
+
+
+def _print_table(title: str, summary: dict[str, dict]) -> None:
+    """Print one accuracy table, best-scoring arm first.
+
+    ``summary`` is the mapping produced by ``_summarise``; accuracy, F1 and
+    failure rate are rendered as percentages.
+    """
+    header = " ".join((
+        f"{'arm':<25}", f"{'n':>4}", f"{'n_corr':>7}", f"{'acc':>7}",
+        f"{'F1':>7}", f"{'fails':>6}", f"{'fail%':>7}",
+    ))
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    # Stable sort: arms with equal accuracy keep their insertion order.
+    for arm in sorted(summary, key=lambda name: -summary[name]["accuracy"]):
+        stats = summary[arm]
+        cells = (
+            f"{arm:<25}",
+            f"{stats['n']:>4}",
+            f"{stats['n_correct']:>7}",
+            f"{stats['accuracy']*100:>6.1f}%",
+            f"{stats['f1_mean']*100:>6.1f}%",
+            f"{stats['n_failures']:>6}",
+            f"{stats['failure_rate']*100:>6.1f}%",
+        )
+        print(" ".join(cells))
+
+
+def main() -> int:
+    """Merge retry survivors into the raw run and report accuracy before/after.
+
+    Reads ``raw.jsonl`` and ``raw_retries.jsonl`` of the run selected with
+    ``--run-id``, replaces every raw row whose retry is flagged as recovered,
+    writes the merged rows to ``raw_post_retry.jsonl`` and prints the raw
+    table, the post-retry table and the per-arm delta.
+
+    Returns:
+        0 on success, 1 when one of the two input files is missing.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = parser.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    retry_path = run_dir / "raw_retries.jsonl"
+    out_path = run_dir / "raw_post_retry.jsonl"
+
+    if not raw_path.exists():
+        print(f"raw.jsonl not found at {raw_path}", file=sys.stderr)
+        return 1
+    if not retry_path.exists():
+        print(f"raw_retries.jsonl not found at {retry_path}", file=sys.stderr)
+        return 1
+
+    raw_rows = _read_jsonl(raw_path)
+    retry_rows = _read_jsonl(retry_path)
+
+    # If the retry artifact holds several rows for one (arm, qid), the last
+    # one wins.
+    retry_by_key: dict[tuple[str, str], dict] = {
+        _row_key(r): r for r in retry_rows
+    }
+
+    merged_rows: list[dict] = []
+    n_replaced_recovered = 0
+    n_replaced_still_failed = 0
+    n_unchanged = 0
+    for row in raw_rows:
+        key = _row_key(row)
+        retry = retry_by_key.get(key)
+        if retry is None:
+            merged_rows.append(row)
+            n_unchanged += 1
+            continue
+        # The retry artifact carries a fresh ArmResult + grade in the same
+        # shape, plus a "retry" sub-object. We use the retry row whenever
+        # it represents a recovery; otherwise we keep the original (the
+        # retry confirms it is intrinsic, but the original row is the one
+        # the headline numbers were computed from, and the failure verdict
+        # is identical either way).
+        # ``or {}`` also covers an explicit ``"retry": null``, which
+        # ``.get("retry", {})`` would pass through as None and crash on.
+        recovered = bool((retry.get("retry") or {}).get("recovered"))
+        if recovered:
+            merged_rows.append(retry)
+            n_replaced_recovered += 1
+        else:
+            merged_rows.append(row)
+            n_replaced_still_failed += 1
+
+    # Persist merged jsonl for downstream consumers
+    with out_path.open("w", encoding="utf-8") as fh:
+        for r in merged_rows:
+            fh.write(json.dumps(r) + "\n")
+
+    # Bucket per arm
+    raw_by_arm: dict[str, list[dict]] = {}
+    for r in raw_rows:
+        raw_by_arm.setdefault(r["arm"], []).append(r)
+    post_by_arm: dict[str, list[dict]] = {}
+    for r in merged_rows:
+        post_by_arm.setdefault(r["arm"], []).append(r)
+
+    raw_summary = _summarise(raw_by_arm)
+    post_summary = _summarise(post_by_arm)
+
+    print()
+    print(f"Run: {args.run_id}")
+    print(f"Replaced (retry recovered):     {n_replaced_recovered}")
+    print(f"Kept original (retry still failed): {n_replaced_still_failed}")
+    print(f"Untouched rows:                 {n_unchanged}")
+    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")
+
+    _print_table("Raw run (no retries)", raw_summary)
+    _print_table("Post-retry run (final)", post_summary)
+
+    print()
+    print("Delta (post-retry minus raw):")
+    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
+    print("-" * 42)
+    for arm in sorted(set(raw_summary) | set(post_summary)):
+        r = raw_summary.get(arm)
+        p = post_summary.get(arm)
+        if not r or not p:
+            continue
+        # Accuracy delta in percentage points, failure delta in rows.
+        d_acc = (p["accuracy"] - r["accuracy"]) * 100
+        d_fail = p["n_failures"] - r["n_failures"]
+        print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/inspect_first30.py
+++ b/surfsense_evals/scripts/inspect_first30.py
@ -0,0 +1,59 @@
+"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
+
+Run from surfsense_evals/ root via:
+    python scripts/inspect_first30.py
+
+Prints which docs are already ingested (existing 5), which are new (25 to
+upload), how many questions cover those 30 PDFs, and the answerable /
+unanswerable + format mix.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+
+
+def main() -> None:
+    """Summarise the first 30 MMLongBench-Doc PDFs (alphabetical by doc_id).
+
+    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to the
+    current working directory and prints how many of the selected PDFs are
+    new versus already ingested, the answerable / unanswerable question
+    counts, per-PDF averages, the answer-format mix and the list of new PDFs.
+    Averages use the number of PDFs actually selected, so the figures stay
+    correct when the question file covers fewer than 30 documents.
+    """
+    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
+    lines = qpath.read_text(encoding="utf-8").splitlines()
+    rows = [json.loads(line) for line in lines if line.strip()]
+
+    docs_by_id = sorted({r["doc_id"] for r in rows})
+    first30 = docs_by_id[:30]
+    n_docs = len(first30)
+    existing5 = {
+        "05-03-18-political-release.pdf",
+        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
+        "0e94b4197b10096b1f4c699701570fbf.pdf",
+        "11-21-16-Updated-Post-Election-Release.pdf",
+        "12-15-15-ISIS-and-terrorism-release-final.pdf",
+    }
+    new25 = [d for d in first30 if d not in existing5]
+    print(
+        f"first {n_docs} docs (alphabetical) — {len(new25)} new, "
+        f"{n_docs - len(new25)} already in SurfSense"
+    )
+
+    # Build the lookup set once instead of once per question row.
+    in_scope = set(first30)
+    qs_in_30 = [r for r in rows if r["doc_id"] in in_scope]
+    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
+    # An answer_format of "none" marks an unanswerable question.
+    answerable = sum(v for k, v in fmts.items() if k != "none")
+    unanswerable = fmts.get("none", 0)
+
+    print(
+        f"questions covering first {n_docs} docs: total={len(qs_in_30)}  "
+        f"answerable={answerable}  unanswerable={unanswerable}"
+    )
+    # Divide by the real document count; fall back to 1 so an empty question
+    # file prints 0.0 instead of raising ZeroDivisionError.
+    per_pdf = n_docs or 1
+    print(
+        f"avg Qs/PDF: {len(qs_in_30) / per_pdf:.1f}  "
+        f"answerable/PDF: {answerable / per_pdf:.1f}"
+    )
+    print(f"format mix in scope: {dict(fmts)}")
+    print()
+    print(f"{len(new25)} new PDFs to ingest:")
+    for d in new25:
+        print(f"  - {d}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
+++ b/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
@ -0,0 +1,100 @@
+"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
+
+The mmlongbench Surfsense ingest writes its manifest only at the very
+end of the upload pipeline (~hours of celery work). parser_compare's
+ingest, on the other hand, just needs a list of (doc_id, pdf_path)
+tuples to know which PDFs to extract — it doesn't care about the
+SurfSense ``document_id`` (the runner does, later, after a refresh).
+
+This script extends the existing manifest with the *additional* PDFs
+that mmlongbench has already cached on disk (i.e. all 30 PDFs in
+``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
+SurfSense ``document_id``s yet) so parser_compare can run all four
+extractions for them in parallel with the SurfSense ingest.
+
+After mmlongbench finishes, re-run::
+
+    python -m surfsense_evals ingest multimodal_doc parser_compare \
+        --max-docs 30
+
+…to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
+``document_id`` values for the 25 new PDFs. The extractions
+themselves are cached on disk so the second pass is essentially free.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+REPO = Path(__file__).resolve().parents[1]
+MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+
+
+def _question_count_per_doc() -> dict[str, int]:
+    """Count the rows of ``QUESTIONS`` per ``doc_id``, skipping blank lines."""
+    tally: dict[str, int] = {}
+    with QUESTIONS.open("r", encoding="utf-8") as fh:
+        for text in map(str.strip, fh):
+            if text:
+                doc_id = json.loads(text)["doc_id"]
+                tally[doc_id] = tally.get(doc_id, 0) + 1
+    return tally
+
+
+def main() -> None:
+    """Append stub manifest rows for cached PDFs the manifest does not list.
+
+    The manifest at ``MAP_PATH`` is read, every ``*.pdf`` in ``PDF_DIR``
+    without a row gets one with ``document_id`` set to ``None``, and the file
+    is rewritten with the ``__settings__`` line (if any) first and the rows
+    sorted by ``doc_id``.
+
+    Raises:
+        SystemExit: if the manifest file does not exist yet.
+    """
+    if not MAP_PATH.exists():
+        raise SystemExit(
+            f"manifest not found at {MAP_PATH} — "
+            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+
+    # Pass 1: split the manifest into the settings line and the doc rows.
+    settings_line = None
+    existing_rows: list[dict] = []
+    for raw in MAP_PATH.read_text(encoding="utf-8").splitlines():
+        text = raw.strip()
+        if not text:
+            continue
+        parsed = json.loads(text)
+        if "__settings__" in parsed:
+            settings_line = text
+            continue
+        existing_rows.append(parsed)
+
+    by_doc_id: dict[str, dict] = {}
+    for entry in existing_rows:
+        by_doc_id[entry["doc_id"]] = entry
+    counts = _question_count_per_doc()
+
+    cached_pdfs = sorted(PDF_DIR.glob("*.pdf"))
+    print(f"existing manifest entries: {len(existing_rows)}")
+    print(f"cached PDFs on disk:       {len(cached_pdfs)}")
+
+    # Pass 2: stub every cached PDF that has no manifest row yet.
+    unlisted = [pdf for pdf in cached_pdfs if pdf.name not in by_doc_id]
+    for pdf in unlisted:
+        by_doc_id[pdf.name] = {
+            "doc_id": pdf.name,
+            "document_id": None,
+            "pdf_path": str(pdf),
+            "n_questions": counts.get(pdf.name, 0),
+        }
+    added = len(unlisted)
+
+    out_lines: list[str] = [settings_line] if settings_line else []
+    out_lines.extend(json.dumps(by_doc_id[doc_id]) for doc_id in sorted(by_doc_id))
+    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
+
+    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
+    print(f"wrote: {MAP_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_crag_run.py
+++ b/surfsense_evals/scripts/peek_crag_run.py
@ -0,0 +1,37 @@
+"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    """Dump every question of the latest CRAG run with each arm's verdict.
+
+    The latest run is the lexicographically last match of
+    ``data/research/runs/*/crag/raw.jsonl`` (relative to the current working
+    directory). For each question the domain, question type, question text
+    and gold answer are taken from the ``bare_llm`` row, followed by the
+    grade, grading method and the last 200 characters of the answer for the
+    ``bare_llm``, ``long_context`` and ``surfsense`` arms.
+
+    Raises:
+        SystemExit: if no run matches the glob pattern.
+    """
+    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
+    if not candidates:
+        # Friendlier than the IndexError an empty glob used to produce.
+        raise SystemExit("no CRAG run found under data/research/runs/*/crag/raw.jsonl")
+    raw_path = candidates[-1]
+    print(f"Reading: {raw_path}")
+    # Context manager so the file handle is closed deterministically.
+    with open(raw_path, encoding="utf-8") as fh:
+        rows = [json.loads(line) for line in fh if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    for qid, arms in by_q.items():
+        bare = arms.get("bare_llm", {})
+        long_ctx = arms.get("long_context", {})
+        surf = arms.get("surfsense", {})
+        print(f"\n=== {qid} ({bare.get('domain')}/{bare.get('question_type')}) ===")
+        # ``or {}`` keeps rows with ``"extra": null`` / ``"graded": null``
+        # from crashing the dump.
+        print(f"  question: {(bare.get('extra') or {}).get('question', '?')!r}")
+        print(f"  gold: {bare.get('gold')!r}")
+        for arm_name, a in (("bare_llm", bare), ("long_context", long_ctx), ("surfsense", surf)):
+            grade = a.get("graded") or {}
+            text = (a.get("raw_text") or "").strip()
+            tail = text[-200:] if text else ""
+            print(
+                f"  [{arm_name}] grade={grade.get('grade')} "
+                f"method={grade.get('method')}"
+            )
+            print(f"    -> {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_disagreements.py
+++ b/surfsense_evals/scripts/peek_disagreements.py
@ -0,0 +1,64 @@
+"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def _print_examples(heading: str, qids: list, by_q: dict, limit: int = 5) -> None:
+    """Print gold answer plus each arm's grade and answer tail for the first ``limit`` qids."""
+    print(heading)
+    for qid in qids[:limit]:
+        arms = by_q[qid]
+        # Question metadata is read from the bare_llm row.
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            # Only the last 180 characters of the answer are shown.
+            tail = t[-180:] if t else ""
+            grade = a.get("graded") or {}
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+
+def main() -> None:
+    """Report questions where the SurfSense arm and the baseline arms disagree.
+
+    Reads the lexicographically last ``data/research/runs/*/crag/raw.jsonl``
+    (relative to the current working directory), counts three disagreement
+    buckets by comparing each arm's ``graded.grade`` value, and prints up to
+    five examples for the two SurfSense-vs-long-context buckets. Prints a
+    notice and returns when no run exists.
+    """
+    run_files = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
+    if not run_files:
+        print("(no CRAG runs found)")
+        return
+    raw_path = run_files[-1]
+    print(f"Reading: {raw_path}")
+    # Context manager so the file handle is closed deterministically.
+    with open(raw_path, encoding="utf-8") as fh:
+        rows = [json.loads(line) for line in fh if line.strip()]
+
+    # qid -> arm name -> raw row (a later row for the same pair wins).
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    def _grade_of(arms: dict, arm_name: str):
+        # ``or {}`` also covers rows that store an explicit null for "graded".
+        return (arms.get(arm_name, {}).get("graded") or {}).get("grade")
+
+    surf_wrong_lc_right = []
+    lc_wrong_surf_right = []
+    surf_wrong_bare_right = []
+    for qid, arms in by_q.items():
+        b = _grade_of(arms, "bare_llm")
+        lc = _grade_of(arms, "long_context")
+        s = _grade_of(arms, "surfsense")
+        if s == "incorrect" and lc == "correct":
+            surf_wrong_lc_right.append(qid)
+        if lc == "incorrect" and s == "correct":
+            lc_wrong_surf_right.append(qid)
+        if s == "incorrect" and b == "correct":
+            surf_wrong_bare_right.append(qid)
+
+    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
+    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
+    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")
+
+    _print_examples(
+        "\n=== Where SurfSense is wrong but long-context is right (top 5) ===",
+        surf_wrong_lc_right, by_q,
+    )
+    _print_examples(
+        "\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===",
+        lc_wrong_surf_right, by_q,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/retry_failed_questions.py
+++ b/surfsense_evals/scripts/retry_failed_questions.py
@ -0,0 +1,636 @@
+"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
+
+The original parser_compare run records one row per (arm, qid) in
+``raw.jsonl``. Some of those rows came back with transient transport
+errors (SSL alerts, gateway 502s, empty SSE streams) or empty
+``raw_text``. This script re-issues *only* those calls with exponential
+backoff so we can see how many recover.
+
+Design constraints / choices:
+
+* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
+  on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+  + the original ``mmlongbench/questions.jsonl``.
+* **No SurfSense backend or celery required.** SurfSense had 0
+  reported failures; this script will skip any ``surfsense_agentic``
+  rows it encounters and warn rather than try to start the backend.
+* **Original ``raw.jsonl`` is never mutated.** Retries land in a
+  sibling ``raw_retries.jsonl`` so the original artifact stays
+  citeable.
+* **Idempotent.** Re-running this script re-tries the same set of
+  failed rows from ``raw.jsonl``. If you want to merge survivor rows
+  back in, do that as a separate aggregation step.
+
+Usage:
+
+    python scripts/retry_failed_questions.py \
+        --run-id 2026-05-14T00-53-19Z \
+        --max-attempts 5 \
+        --concurrency 2
+
+Outputs (written next to the original raw.jsonl):
+
+* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
+  carries the original error, every retry attempt's timing/error,
+  and the final result (incl. grade) so you can drop it straight
+  into a notebook.
+* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
+  counts and an aggregated retry-success rate.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+REPO = Path(__file__).resolve().parents[1]
+SRC = REPO / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from dotenv import load_dotenv  # noqa: E402
+
+from surfsense_evals.core.arms import (  # noqa: E402
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+)
+from surfsense_evals.core.parse.freeform_answer import (  # noqa: E402
+    extract_freeform_answer,
+)
+from surfsense_evals.core.providers.openrouter_chat import (  # noqa: E402
+    OpenRouterChatProvider,
+)
+from surfsense_evals.core.providers.openrouter_pdf import (  # noqa: E402
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade  # noqa: E402
+from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import (  # noqa: E402
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+)
+
+logger = logging.getLogger("retry_failed_questions")
+
+LC_ARMS = {
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_failure_row(row: dict[str, Any]) -> bool:
+    """Tell whether a raw.jsonl row should be retried.
+
+    A row is a failure when its ``error`` field is truthy, or when its
+    ``raw_text`` is missing, empty, or whitespace-only. The empty-text
+    case is treated like an error because the call produced nothing
+    usable even though no exception was recorded.
+    """
+
+    # Short-circuit: the text is only inspected when no error was recorded.
+    return bool(row.get("error")) or not (row.get("raw_text") or "").strip()
+
+
+@dataclass
+class FailedRow:
+    """One failed (arm, qid) row taken from the original ``raw.jsonl``."""
+
+    # Arm name as recorded in the row's "arm" field.
+    arm: str
+    # Question id from the row's "qid" field.
+    qid: str
+    # Document id from the row's "doc_id" field.
+    doc_id: str
+    # Row's "answer_format"; empty string when absent.
+    answer_format: str
+    # Row's "gold" value coerced to str; empty string when absent.
+    gold: str
+    # Row's "pages" value coerced to int; 0 when absent.
+    pages: int
+    # Row's "document_id", passed through unchanged (may be None).
+    document_id: int | None
+    # Row's "error" field, passed through unchanged (None for empty-text failures).
+    original_error: str | None
+    # The full parsed JSON row this record was built from.
+    original_row: dict[str, Any]
+
+
+def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
+    """Read ``raw_path`` (JSON lines) and return a ``FailedRow`` per failed row.
+
+    Blank lines are ignored; rows for which ``_is_failure_row`` is false
+    are dropped. File order is preserved.
+    """
+    failures: list[FailedRow] = []
+    with raw_path.open("r", encoding="utf-8") as handle:
+        for raw_line in handle:
+            stripped = raw_line.strip()
+            if not stripped:
+                continue
+            record = json.loads(stripped)
+            if _is_failure_row(record):
+                failures.append(
+                    FailedRow(
+                        arm=str(record["arm"]),
+                        qid=str(record["qid"]),
+                        doc_id=str(record["doc_id"]),
+                        answer_format=str(record.get("answer_format") or ""),
+                        gold=str(record.get("gold") or ""),
+                        pages=int(record.get("pages") or 0),
+                        document_id=record.get("document_id"),
+                        original_error=record.get("error"),
+                        original_row=record,
+                    )
+                )
+    return failures
+
+
+def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    """Load the JSON-lines manifest at ``map_path`` keyed by ``str(doc_id)``.
+
+    Blank lines are ignored. When a doc_id occurs more than once the
+    last row wins.
+    """
+    with map_path.open("r", encoding="utf-8") as handle:
+        non_blank = (text for text in (raw.strip() for raw in handle) if text)
+        return {
+            str(record["doc_id"]): record
+            for record in map(json.loads, non_blank)
+        }
+
+
+def _load_question_text_index(
+    questions_jsonl: Path,
+) -> dict[tuple[str, int], dict[str, Any]]:
+    """Index the questions file by ``(doc_id, per-document position)``.
+
+    The file is walked in order and each doc_id gets its own 0-based
+    counter, so the n-th row seen for a document is stored under
+    ``(doc_id, n)``. Per the qid convention ``{doc_id}::Q{NNN}`` this
+    position is the number that ``_qid_index`` extracts. Blank lines and
+    rows with a missing/empty ``doc_id`` are skipped and do not advance
+    any counter.
+    """
+
+    index: dict[tuple[str, int], dict[str, Any]] = {}
+    seen_per_doc: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as handle:
+        non_blank = (text for text in (raw.strip() for raw in handle) if text)
+        for record in map(json.loads, non_blank):
+            doc_key = str(record.get("doc_id") or "")
+            if doc_key:
+                position = seen_per_doc.get(doc_key, 0)
+                index[(doc_key, position)] = record
+                seen_per_doc[doc_key] = position + 1
+    return index
+
+
+def _qid_index(qid: str) -> int:
+    """Return the integer after the ``Q`` in a qid such as ``foo.pdf::Q007``.
+
+    Only the text after the last ``::`` is examined (the whole string when
+    there is no ``::``). Raises ``ValueError`` when that text does not
+    start with ``Q`` or when the remainder is not an integer.
+    """
+
+    suffix = qid.rpartition("::")[2]
+    if suffix[:1] != "Q":
+        raise ValueError(f"unexpected qid shape: {qid!r}")
+    return int(suffix[1:])
+
+
+# ---------------------------------------------------------------------------
+# Request building (mirrors runner.py exactly so prompts are byte-identical)
+# ---------------------------------------------------------------------------
+
+
+def _build_native_request(
+    qid: str, question: str, answer_format: str, pdf_path: Path,
+    *, max_output_tokens: int,
+) -> ArmRequest:
+    """Build the ``ArmRequest`` for the native-PDF arm.
+
+    The prompt comes from ``build_native_pdf_prompt``; the PDF is attached
+    as the only entry of ``pdf_paths`` and ``max_output_tokens`` is passed
+    as the ``max_tokens`` option.
+    """
+    native_prompt = build_native_pdf_prompt(question, answer_format=answer_format)
+    request_options = {"max_tokens": max_output_tokens}
+    return ArmRequest(
+        question_id=qid,
+        prompt=native_prompt,
+        pdf_paths=[pdf_path],
+        options=request_options,
+    )
+
+
+def _build_lc_request(
+    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
+) -> ArmRequest:
+    """Build the ``ArmRequest`` for a long-context arm from cached markdown.
+
+    Reads the parser extraction at ``md_path`` and embeds it in the prompt
+    via ``build_long_context_prompt`` with ``doc_id`` as the document label.
+
+    Raises:
+        FileNotFoundError: when ``md_path`` does not exist.
+    """
+    if md_path.exists():
+        lc_prompt = build_long_context_prompt(
+            question,
+            answer_format=answer_format,
+            document_markdown=md_path.read_text(encoding="utf-8"),
+            document_label=doc_id,
+        )
+        return ArmRequest(question_id=qid, prompt=lc_prompt)
+    raise FileNotFoundError(
+        f"Missing parser extraction at {md_path}; cannot retry LC arm."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Retry driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AttemptLog:
+    """Timing and outcome of a single retry attempt, as recorded by ``_retry_one``."""
+
+    # 1-based attempt number.
+    attempt: int
+    # Attempt start time, formatted from time.gmtime() as "%Y-%m-%dT%H:%M:%SZ".
+    started_iso: str
+    # Duration of the arm's answer() call in whole milliseconds.
+    latency_ms: int
+    # The result's error, or a synthetic "EmptyResponse: ..." message when the
+    # call reported no error but returned no text; None on success.
+    error: str | None
+    # Length of the whitespace-stripped response text.
+    raw_text_chars: int
+
+
+@dataclass
+class RetryOutcome:
+    """Result of retrying one (arm, qid) pair, as returned by ``_retry_one``."""
+
+    # Arm name the retry was issued for.
+    arm: str
+    # Question id the retry was issued for.
+    qid: str
+    # One entry per attempt made, in order.
+    attempts: list[AttemptLog]
+    # The ArmResult of the last attempt made (the successful one when recovered).
+    final_result: ArmResult
+    # True when some attempt returned no error and non-empty text.
+    recovered: bool
+
+
+async def _retry_one(
+    arm_obj: Any, request: ArmRequest, *,
+    arm_name: str,
+    qid: str,
+    max_attempts: int,
+    base_delay: float,
+    max_delay: float,
+) -> RetryOutcome:
+    """Call ``arm_obj.answer(request)`` until it yields text or attempts run out.
+
+    An attempt succeeds when the result has no ``error`` and its stripped
+    ``raw_text`` is non-empty; a no-error/no-text result is logged as an
+    ``EmptyResponse`` failure. Between failed attempts the coroutine
+    sleeps ``min(max_delay, base_delay * 2**(attempt-1))`` scaled by a
+    random factor in [0.5, 1.5). No sleep follows the final attempt.
+
+    Returns:
+        A ``RetryOutcome`` carrying every ``AttemptLog`` and the last
+        ``ArmResult``; ``recovered`` is True only on success.
+    """
+    history: list[AttemptLog] = []
+    last_result: ArmResult | None = None
+    for attempt_no in range(1, max_attempts + 1):
+        stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        tick = time.monotonic()
+        last_result = await arm_obj.answer(request)
+        elapsed_ms = int((time.monotonic() - tick) * 1000)
+
+        body = (last_result.raw_text or "").strip()
+        failure = last_result.error
+        if not (failure or body):
+            # The call "succeeded" but produced nothing usable.
+            failure = "EmptyResponse: stream ended with no text"
+        history.append(
+            AttemptLog(
+                attempt=attempt_no,
+                started_iso=stamp,
+                latency_ms=elapsed_ms,
+                error=failure,
+                raw_text_chars=len(body),
+            )
+        )
+
+        if body and not failure:
+            return RetryOutcome(
+                arm=arm_name, qid=qid, attempts=history,
+                final_result=last_result, recovered=True,
+            )
+        if attempt_no >= max_attempts:
+            continue
+        # Exponential backoff capped at max_delay, then jittered.
+        backoff = min(max_delay, base_delay * (2 ** (attempt_no - 1)))
+        backoff = backoff * (0.5 + random.random())
+        logger.info(
+            "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
+            arm_name, qid, attempt_no, max_attempts, failure, backoff,
+        )
+        await asyncio.sleep(backoff)
+
+    assert last_result is not None
+    return RetryOutcome(
+        arm=arm_name, qid=qid, attempts=history,
+        final_result=last_result, recovered=False,
+    )
+
+
+async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
+    """Await every coroutine in ``coros`` with at most ``concurrency`` running at once.
+
+    ``concurrency`` values below 1 are treated as 1. Results are returned
+    in the same order as ``coros``.
+    """
+    gate = asyncio.Semaphore(max(1, concurrency))
+
+    async def _bounded(pending):
+        # Hold a semaphore slot for the full lifetime of the wrapped coroutine.
+        async with gate:
+            outcome = await pending
+        return outcome
+
+    bounded = [_bounded(item) for item in coros]
+    return await asyncio.gather(*bounded)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Retry every failed (arm, qid) row of one parser_compare run and write the results.
+
+    Flow:
+      1. Load ``.env``, locate ``raw.jsonl``, the parser_compare manifest and
+         the mmlongbench questions file.
+      2. Collect failed rows; ``surfsense_agentic`` rows are dropped unless
+         ``--include-surfsense`` is set (no SurfSense arm is built here, so
+         kept rows are still skipped later as unhandled).
+      3. Build one request per remaining row; rows whose inputs are missing
+         (bad qid, unknown question, missing PDF or extraction) are logged
+         and skipped rather than aborting the pass.
+      4. Run the retries with bounded concurrency, grade the final result of
+         each, and write ``raw_retries.jsonl`` plus
+         ``raw_retries_summary.json`` next to ``raw.jsonl``.
+      5. Print a per-arm summary table.
+
+    Args:
+        args: Parsed CLI namespace; reads ``run_id``, ``include_surfsense``,
+            ``llm_model``, ``pdf_engine``, ``max_output_tokens``,
+            ``max_attempts``, ``base_delay``, ``max_delay`` and ``concurrency``.
+
+    Returns:
+        0 on every path that does not abort.
+
+    Raises:
+        SystemExit: when ``raw.jsonl``, the manifest, the questions file or
+            ``OPENROUTER_API_KEY`` is missing.
+    """
+    load_dotenv(REPO / ".env")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    if not raw_path.exists():
+        raise SystemExit(f"raw.jsonl not found at {raw_path}")
+
+    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+    if not map_path.exists():
+        raise SystemExit(f"parser_compare manifest not found at {map_path}")
+    if not questions_jsonl.exists():
+        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")
+
+    failed = _load_failed_rows(raw_path)
+    if not failed:
+        logger.info("No failed rows in %s — nothing to retry.", raw_path)
+        return 0
+
+    # SurfSense rows: this script never starts the backend, so they are
+    # dropped by default. The log message reflects what actually happens.
+    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
+    if surf_failed and not args.include_surfsense:
+        logger.warning(
+            "Skipping %d surfsense_agentic failures; this script doesn't drive the backend.",
+            len(surf_failed),
+        )
+        failed = [f for f in failed if f.arm != "surfsense_agentic"]
+    elif surf_failed:
+        logger.warning(
+            "--include-surfsense kept %d surfsense_agentic failures, but this script "
+            "builds no SurfSense arm; they will be reported as unhandled and skipped.",
+            len(surf_failed),
+        )
+    else:
+        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
+
+    if not failed:
+        logger.info("Nothing left to retry after filtering.")
+        return 0
+
+    by_arm_count: dict[str, int] = {}
+    for f in failed:
+        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
+    logger.info(
+        "Loaded %d failed rows across %d arms: %s",
+        len(failed), len(by_arm_count),
+        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
+    )
+
+    doc_map = _load_doc_map(map_path)
+    qtext_idx = _load_question_text_index(questions_jsonl)
+
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")
+
+    native_provider = OpenRouterPdfProvider(
+        api_key=api_key,
+        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+        model=args.llm_model,
+        engine=PdfEngine(args.pdf_engine),
+    )
+    native_arm = NativePdfArm(
+        provider=native_provider, max_output_tokens=args.max_output_tokens,
+    )
+
+    # One BareLlmArm per long-context arm that actually has failures.
+    lc_arms: dict[str, BareLlmArm] = {}
+    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
+        lc_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+            model=args.llm_model,
+        )
+        lc_arms[arm_name] = BareLlmArm(
+            provider=lc_provider,
+            max_output_tokens=args.max_output_tokens,
+            name=arm_name,
+        )
+
+    # ``plan`` and ``coros`` are appended in lockstep so outcomes can be
+    # zipped back to their originating row after the gather.
+    coros: list = []
+    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
+
+    for f in failed:
+        # Look up the question text from questions.jsonl
+        try:
+            q_idx = _qid_index(f.qid)
+        except ValueError:
+            logger.error("Bad qid %r — skipping", f.qid)
+            continue
+        qrow = qtext_idx.get((f.doc_id, q_idx))
+        if qrow is None:
+            logger.error(
+                "Could not find question text for %s (idx %d) — skipping",
+                f.doc_id, q_idx,
+            )
+            continue
+        question_text = str(qrow.get("question") or "").strip()
+        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()
+
+        map_row = doc_map.get(f.doc_id)
+        if map_row is None:
+            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
+            continue
+
+        if f.arm == "native_pdf":
+            pdf_path = Path(map_row["pdf_path"])
+            if not pdf_path.exists():
+                logger.error("PDF missing on disk: %s — skipping", pdf_path)
+                continue
+            request = _build_native_request(
+                f.qid, question_text, answer_format, pdf_path,
+                max_output_tokens=args.max_output_tokens,
+            )
+            arm_obj = native_arm
+        elif f.arm in LC_ARMS:
+            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
+            md_path_str = ext_blob.get("markdown_path")
+            if not md_path_str or ext_blob.get("status") != "ok":
+                logger.error(
+                    "Missing extraction for %s on %s — cannot retry; skipping",
+                    f.arm, f.doc_id,
+                )
+                continue
+            try:
+                request = _build_lc_request(
+                    f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
+                )
+            except FileNotFoundError as exc:
+                # The manifest lists the extraction but the file is gone:
+                # skip this row like every other missing-input case instead
+                # of aborting the whole retry pass.
+                logger.error("%s — skipping", exc)
+                continue
+            arm_obj = lc_arms[f.arm]
+        else:
+            logger.warning("Unhandled arm %s — skipping", f.arm)
+            continue
+
+        plan.append((f, request, arm_obj))
+        coros.append(_retry_one(
+            arm_obj, request,
+            arm_name=f.arm, qid=f.qid,
+            max_attempts=args.max_attempts,
+            base_delay=args.base_delay,
+            max_delay=args.max_delay,
+        ))
+
+    if not coros:
+        logger.warning("Nothing to retry after request building.")
+        return 0
+
+    logger.info(
+        "Retrying %d failed rows with up to %d attempts each "
+        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
+        len(coros), args.max_attempts, args.base_delay, args.max_delay,
+        args.concurrency,
+    )
+
+    started = time.monotonic()
+    outcomes: list[RetryOutcome] = await _gather_with_limit(
+        coros, concurrency=args.concurrency,
+    )
+    elapsed = time.monotonic() - started
+    logger.info("Retry pass finished in %.1fs.", elapsed)
+
+    out_path = run_dir / "raw_retries.jsonl"
+    summary_path = run_dir / "raw_retries_summary.json"
+
+    per_arm_recovered: dict[str, int] = {}
+    per_arm_total: dict[str, int] = {}
+    per_arm_attempts_dist: dict[str, list[int]] = {}
+
+    with out_path.open("w", encoding="utf-8") as fh:
+        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
+            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
+            if outcome.recovered:
+                per_arm_recovered[outcome.arm] = (
+                    per_arm_recovered.get(outcome.arm, 0) + 1
+                )
+            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
+                len(outcome.attempts)
+            )
+
+            # Graded even when the retry did not recover (empty prediction).
+            g = grade(
+                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
+                gold=f.gold,
+                answer_format=f.answer_format,
+            )
+            row = {
+                "qid": f.qid,
+                "doc_id": f.doc_id,
+                "arm": f.arm,
+                "answer_format": f.answer_format,
+                "gold": f.gold,
+                "pages": f.pages,
+                "document_id": f.document_id,
+                "original_error": f.original_error,
+                "retry": {
+                    "max_attempts": args.max_attempts,
+                    "n_attempts": len(outcome.attempts),
+                    "recovered": outcome.recovered,
+                    "attempts": [
+                        {
+                            "attempt": a.attempt,
+                            "started_iso": a.started_iso,
+                            "latency_ms": a.latency_ms,
+                            "error": a.error,
+                            "raw_text_chars": a.raw_text_chars,
+                        }
+                        for a in outcome.attempts
+                    ],
+                },
+                # Keys produced by to_jsonl() override same-named keys above.
+                **outcome.final_result.to_jsonl(),
+                "graded": {
+                    "correct": g.correct,
+                    "f1": g.f1,
+                    "method": g.method,
+                    "normalised_pred": g.normalised_pred,
+                    "normalised_gold": g.normalised_gold,
+                },
+            }
+            fh.write(json.dumps(row) + "\n")
+
+    summary = {
+        "run_id": args.run_id,
+        "raw_retries_path": str(out_path.relative_to(REPO)),
+        "n_failed_rows_input": len(failed),
+        "n_retried": len(coros),
+        "elapsed_s": round(elapsed, 1),
+        "config": {
+            "max_attempts": args.max_attempts,
+            "base_delay": args.base_delay,
+            "max_delay": args.max_delay,
+            "concurrency": args.concurrency,
+            "llm_model": args.llm_model,
+            "pdf_engine": args.pdf_engine,
+            "max_output_tokens": args.max_output_tokens,
+        },
+        "per_arm": {
+            arm: {
+                "tried": per_arm_total.get(arm, 0),
+                "recovered": per_arm_recovered.get(arm, 0),
+                "still_failed": (
+                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
+                ),
+                "recovery_rate": (
+                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
+                    if per_arm_total.get(arm) else 0.0
+                ),
+                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
+            }
+            for arm in sorted(per_arm_total)
+        },
+        "totals": {
+            "tried": sum(per_arm_total.values()),
+            "recovered": sum(per_arm_recovered.values()),
+            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
+        },
+    }
+    summary_path.write_text(
+        json.dumps(summary, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+
+    print()
+    print("=" * 78)
+    print("Retry pass summary")
+    print("=" * 78)
+    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(per_arm_total):
+        tried = per_arm_total[arm]
+        rec = per_arm_recovered.get(arm, 0)
+        rate = (rec / tried * 100) if tried else 0.0
+        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
+    total = sum(per_arm_total.values())
+    rec_total = sum(per_arm_recovered.values())
+    rate_total = (rec_total / total * 100) if total else 0.0
+    print("-" * len(header))
+    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
+          f"{rate_total:>6.1f}%")
+    print()
+    print(f"Wrote {out_path.relative_to(REPO)}")
+    print(f"Wrote {summary_path.relative_to(REPO)}")
+    return 0
+
+
+def main() -> None:
+    """Parse CLI arguments, validate them, and run the retry pass.
+
+    Exits the process with the integer returned by ``_run``. Exits via
+    ``parser.error`` when ``--max-attempts`` is below 1, because
+    ``_retry_one`` needs at least one attempt to produce a result.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--run-id", default="2026-05-14T00-53-19Z",
+        help="Run timestamp under data/multimodal_doc/runs/. Default is the "
+             "n=171 production run we wrote up in the blog.",
+    )
+    parser.add_argument("--max-attempts", type=int, default=5)
+    parser.add_argument("--base-delay", type=float, default=1.0,
+                        help="Base seconds for exponential backoff (default 1s).")
+    parser.add_argument("--max-delay", type=float, default=30.0,
+                        help="Cap on per-retry sleep (default 30s).")
+    parser.add_argument("--concurrency", type=int, default=2,
+                        help="Parallel retries in flight (default 2 — keep low "
+                             "to avoid the same transport stress that caused "
+                             "the original failures).")
+    parser.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
+    parser.add_argument("--pdf-engine", default="native",
+                        choices=[e.value for e in PdfEngine])
+    parser.add_argument("--max-output-tokens", type=int, default=512)
+    parser.add_argument(
+        "--include-surfsense", action="store_true",
+        help="Keep surfsense_agentic failures in the retry set instead of "
+             "filtering them out. NOTE: this script builds no SurfSense arm, so "
+             "such rows are currently logged as unhandled and skipped. "
+             "Default is to drop them since the n=171 run had 0 SurfSense failures.",
+    )
+    args = parser.parse_args()
+    if args.max_attempts < 1:
+        # With zero attempts _retry_one would have no result to return.
+        parser.error("--max-attempts must be >= 1")
+    raise SystemExit(asyncio.run(_run(args)))
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/summarise_crag_run.py
+++ b/surfsense_evals/scripts/summarise_crag_run.py
@ -0,0 +1,65 @@
+"""Render a quick textual summary of the latest CRAG run."""
+
+from __future__ import annotations
+
+import glob
+import json
+
+
+def main() -> None:
+    """Print headline metrics, deltas and per-slice truthfulness for the latest CRAG run.
+
+    The latest run is the lexicographically last match of
+    ``data/research/runs/*/crag/run_artifact.json`` relative to the current
+    working directory. Prints a notice and returns when none exists. Arms
+    missing from the artifact's metrics are skipped rather than raising.
+    """
+    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
+    if not runs:
+        print("(no CRAG runs found)")
+        return
+    # Context manager so the file handle is closed deterministically.
+    with open(runs[-1], encoding="utf-8") as fh:
+        m = json.load(fh)
+    metrics = m["metrics"]
+    arm_names = ("bare_llm", "long_context", "surfsense")
+
+    print(f"Reading: {runs[-1]}")
+    print(f"n_questions: {m['extra']['n_questions']}")
+    print()
+    print("=== ARMS ===")
+    for arm in arm_names:
+        d = metrics.get(arm)
+        if d is None:
+            # Same tolerance as the per-slice tables below.
+            continue
+        print(
+            f"{arm:14s}: "
+            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
+            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
+            f"correct={d['correct_rate']*100:5.1f}% "
+            f"missing={d['missing_rate']*100:5.1f}% "
+            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
+            f"truth={d['truthfulness_score']*100:+5.1f}%"
+        )
+
+    print()
+    print("=== DELTAS ===")
+    for key, d in metrics["deltas"].items():
+        print(
+            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
+            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
+            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
+            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
+        )
+
+    # Both breakdown tables share one layout; only the heading, the metrics
+    # key and the label column width differ.
+    for heading, metrics_key, label_width in (
+        ("=== PER-QUESTION-TYPE TRUTHFULNESS ===", "per_question_type", 20),
+        ("=== PER-DOMAIN TRUTHFULNESS ===", "per_domain", 10),
+    ):
+        print()
+        print(heading)
+        for label, row in sorted(metrics[metrics_key].items()):
+            n = row["n"]
+            pieces = [f"{label:{label_width}s} (n={n:3d}):"]
+            for arm in arm_names:
+                if arm in row:
+                    pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+            print(" ".join(pieces))
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/summarise_parser_compare_run.py
+++ b/surfsense_evals/scripts/summarise_parser_compare_run.py
@ -0,0 +1,122 @@
+"""Slice the parser_compare raw.jsonl for the n=171 run.
+
+Reports per-arm:
+  * tokens & cost stats (input/output mean, $/Q distribution)
+  * failures (status != ok or empty raw_text)
+  * answer_format breakdown (accuracy by str/int/float/list)
+
+Plus surfsense agentic breakdown so we can compare apples to apples
+even though the new_chat SSE doesn't surface per-call token counts.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+
+REPO = Path(__file__).resolve().parents[1]
+RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN_DIR / "raw.jsonl"
+ARTIFACT = RUN_DIR / "run_artifact.json"
+
+
+def main() -> None:
+    """Print per-arm accuracy, failure, cost, token and latency stats for the pinned run.
+
+    Reads ``RAW`` (one JSON row per arm/question) and ``ARTIFACT`` (the
+    aggregated run artifact) and prints three tables: a per-arm overview, an
+    accuracy-by-answer_format matrix, and the aggregated cost figures stored
+    in the artifact. The ``fail`` column counts rows that recorded an error
+    as well as rows that returned empty text.
+    """
+    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
+    print(f"raw rows: {len(rows)}")
+
+    by_qid: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        by_qid[row["qid"]].append(row)
+    print(f"unique questions: {len(by_qid)}")
+
+    arm_metrics: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
+        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
+        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
+    })
+
+    for row in rows:
+        arm = row["arm"]
+        m = arm_metrics[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["n_correct"] += 1
+
+        # A row is counted in exactly one bucket: errored, or error-free but empty.
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err:
+            m["n_failed"] += 1
+        elif not raw_text.strip():
+            m["n_empty"] += 1
+
+        cost = row.get("cost_usd")
+        if cost is not None:
+            m["costs"].append(float(cost))
+        ut = row.get("usage") or {}
+        # Missing or zero token counts / latencies are left out of the means.
+        if ut.get("prompt_tokens"):
+            m["in_tokens"].append(ut["prompt_tokens"])
+        if ut.get("completion_tokens"):
+            m["out_tokens"].append(ut["completion_tokens"])
+        if row.get("latency_ms"):
+            m["latency_ms"].append(row["latency_ms"])
+
+        fmt = row.get("answer_format") or "unknown"
+        m["by_format"][fmt]["n"] += 1
+        if graded.get("correct"):
+            m["by_format"][fmt]["correct"] += 1
+
+    print()
+    print("=" * 100)
+    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
+    print("=" * 100)
+    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
+    per_arm_art = art["metrics"]["per_arm"]
+    for arm, m in sorted(arm_metrics.items()):
+        acc = m["n_correct"] / m["n"] * 100
+        # Empty-text rows are failures too; previously they were counted
+        # in n_empty but never reported.
+        fail = m["n_failed"] + m["n_empty"]
+        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
+        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
+        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
+        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
+        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
+        # F1 is taken from the artifact, not recomputed from raw rows.
+        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
+        print(
+            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
+            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
+        )
+
+    print()
+    print("by answer_format (accuracy):")
+    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"].keys()})
+    header = f"{'arm':<25} " + " ".join(f"{f:>10}" for f in formats)
+    print(header)
+    print("-" * len(header))
+    for arm, m in sorted(arm_metrics.items()):
+        cells = []
+        for f in formats:
+            row = m["by_format"][f]
+            if row["n"] == 0:
+                cells.append(f"{'-':>10}")
+            else:
+                pct = row["correct"] / row["n"] * 100
+                cells.append(f"{pct:>5.0f}% ({row['correct']:>2}/{row['n']:>2})")
+        print(f"{arm:<25} " + " ".join(cells))
+
+    print()
+    print("=" * 100)
+    print("Aggregated cost (from run_artifact.json):")
+    for arm, row in per_arm_art.items():
+        print(
+            f"  {arm:<25}  acc={row['accuracy']*100:5.1f}% "
+            f"  $/Q LLM={row['llm_cost_per_q']:.4f}  "
+            f"  preprocess total=${row['preprocess_cost_total']:.2f}  "
+            f"  $/Q total={row['total_cost_per_q']:.4f}"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/test_context_overflow_hypothesis.py
+++ b/surfsense_evals/scripts/test_context_overflow_hypothesis.py
@ -0,0 +1,155 @@
+"""Test the hypothesis: were the LC-arm errors actually context-window
+overflow errors disguised as SSL / network failures?
+
+If true, we'd expect:
+  (a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
+  (b) failures correlated with extraction size / input_tokens (large doc -> failure),
+  (c) failing requests near or over Sonnet 4.5's 200k input-token limit.
+
+If false (transport-layer hypothesis), we'd expect:
+  (a) only SSL / 502 / empty stream / JSONDecode strings,
+  (b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
+  (c) failing requests well below the 200k limit.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+MANIFEST = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+CONTEXT_HINTS = (
+    "context_length",
+    "context window",
+    "prompt is too long",
+    "exceeds",
+    "maximum context",
+    "input tokens",
+    "too many tokens",
+    "over the maximum",
+    "200000",
+    "200_000",
+)
+
+
+def main() -> None:
+    """Print four diagnostics, (a) through (d), for the context-overflow hypothesis.
+
+    Reads the run rows from ``RAW`` and the per-(doc, arm) extraction sizes
+    from ``MANIFEST``, then writes a plain-text report to stdout. Nothing is
+    returned and nothing is written to disk.
+    """
+    # One JSON object per non-blank line of the raw run file.
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    # (doc_id, arm) -> extraction size in characters; a missing or falsy
+    # "chars" value is stored as 0.
+    extraction_size: dict[tuple[str, str], int] = {}
+    for line in MANIFEST.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        m = json.loads(line)
+        for arm, ext in (m.get("extractions") or {}).items():
+            extraction_size[(m["doc_id"], arm)] = int(ext.get("chars") or 0)
+
+    # --- (a) substring scan of every error message ------------------------
+    print("=" * 80)
+    print("(a) Literal 'context window' / 'prompt too long' error strings?")
+    print("=" * 80)
+    found = 0
+    for row in rows:
+        # Lower-cased so the CONTEXT_HINTS comparison is case-insensitive.
+        err = (row.get("error") or "").lower()
+        if not err:
+            continue
+        for hint in CONTEXT_HINTS:
+            if hint in err:
+                print(f"  {row['arm']:<25} {row['qid']:<50}")
+                print(f"      -> {err[:240]}")
+                found += 1
+                # Count and print each row at most once, on its first hit.
+                break
+    if not found:
+        print("  none found.")
+
+    # --- (b) extraction size split by outcome, per parser arm -------------
+    print()
+    print("=" * 80)
+    print("(b) Extraction size for OK vs FAILED rows per arm")
+    print("=" * 80)
+    arm_buckets: dict[str, dict[str, list[int]]] = defaultdict(
+        lambda: {"ok": [], "fail": []}
+    )
+    parser_arms = (
+        "azure_basic_lc", "azure_premium_lc",
+        "llamacloud_basic_lc", "llamacloud_premium_lc",
+    )
+    for row in rows:
+        arm = row["arm"]
+        if arm not in parser_arms:
+            continue
+        # Rows with no manifest entry for (doc_id, arm) are bucketed as size 0.
+        size = extraction_size.get((row["doc_id"], arm), 0)
+        # A row counts as failed if it has a truthy error OR blank raw_text.
+        bucket = "fail" if (row.get("error") or not (row.get("raw_text") or "").strip()) else "ok"
+        arm_buckets[arm][bucket].append(size)
+
+    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
+    for arm in parser_arms:
+        for bucket in ("ok", "fail"):
+            sizes = arm_buckets[arm][bucket]
+            if not sizes:
+                # Empty bucket: print a placeholder row instead of statistics.
+                print(f"  {arm:<23} {bucket:<5} {0:>4}  -")
+                continue
+            print(
+                f"  {arm:<23} {bucket:<5} {len(sizes):>4} "
+                f"{statistics.mean(sizes):>12,.0f} "
+                f"{statistics.median(sizes):>10,.0f} "
+                f"{max(sizes):>10,}"
+            )
+
+    # --- (c) largest successful vs largest failed extraction --------------
+    print()
+    print("=" * 80)
+    print("(c) Largest extraction each arm processed *successfully* vs *failed*")
+    print("=" * 80)
+    print(
+        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
+        "context-overflow, max-OK would be near that cap. If max-OK is well "
+        "above max-FAIL, the model handled bigger contexts than the failed "
+        "ones, so size cannot be the cause.)"
+    )
+    print()
+    for arm in parser_arms:
+        ok_sizes = arm_buckets[arm]["ok"]
+        fail_sizes = arm_buckets[arm]["fail"]
+        # Arms with no successful row are omitted from this section entirely.
+        if not ok_sizes:
+            continue
+        max_ok = max(ok_sizes)
+        # Shown as 0 when the arm had no failed rows.
+        max_fail = max(fail_sizes) if fail_sizes else 0
+        # Token figures use the same 4-chars-per-token rule of thumb as the
+        # banner text above.
+        print(
+            f"  {arm:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens)  "
+            f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
+        )
+
+    # --- (d) spot-check one hard-coded (document, arm) pair ---------------
+    print()
+    print("=" * 80)
+    print("(d) Did the *known* overflow candidate fail?")
+    print("=" * 80)
+    print(
+        "  3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
+        "-- this is above Sonnet 4.5's 200k window."
+    )
+    print("  If transport hypothesis is correct, this should still fail with a "
+          "real overflow error.")
+    print("  If transport hypothesis is correct AND the model truncates silently, "
+          "it might 'succeed' but be wrong.")
+    print()
+    for row in rows:
+        if row["doc_id"] != "3M_2018_10K.pdf":
+            continue
+        if row["arm"] != "llamacloud_premium_lc":
+            continue
+        err = row.get("error") or "(none)"
+        graded = row.get("graded") or {}
+        # ``!s`` stringifies ``correct`` (possibly None) before padding.
+        print(
+            f"  {row['qid']:<40} correct={graded.get('correct')!s:<5}  "
+            f"err={err[:100]}"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/src/surfsense_evals/init.py
+++ b/surfsense_evals/src/surfsense_evals/init.py
@ -0,0 +1,10 @@
+"""SurfSense Evals — domain-agnostic eval harness.
+
+Public entry-point is the ``surfsense_evals`` CLI (``python -m surfsense_evals``).
+Programmatic embedding is a non-goal for now; everything goes through the CLI
+plus its filesystem outputs (state.json, raw run JSONL, summary.md/json reports).
+"""
+
+from __future__ import annotations
+
+__version__ = "0.1.0"
--- a/surfsense_evals/src/surfsense_evals/main.py
+++ b/surfsense_evals/src/surfsense_evals/main.py
@ -0,0 +1,13 @@
+"""Module entry point: ``python -m surfsense_evals ...``.
+
+Delegates to ``core.cli.main``. ``core.cli`` lazily imports
+``surfsense_evals.suites`` so every benchmark gets a chance to register
+before argparse builds its subcommand groups.
+"""
+
+from __future__ import annotations
+
+from surfsense_evals.core.cli import main
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())
--- a/surfsense_evals/src/surfsense_evals/core/init.py
+++ b/surfsense_evals/src/surfsense_evals/core/init.py
@ -0,0 +1,8 @@
+"""Domain-agnostic infrastructure shared by every suite.
+
+Nothing under ``core/`` knows or cares about a specific evaluation domain.
+Suites live under ``surfsense_evals.suites.<domain>.<benchmark>`` and
+register themselves with ``core.registry`` on import.
+"""
+
+from __future__ import annotations
--- a/surfsense_evals/src/surfsense_evals/core/arms/init.py
+++ b/surfsense_evals/src/surfsense_evals/core/arms/init.py
@ -0,0 +1,44 @@
+"""Arm protocol + concrete arms shared across suites.
+
+Concrete arms (``NativePdfArm``, ``SurfSenseArm``, ``BareLlmArm``) are
+imported lazily via ``__getattr__`` so consumers that only need the
+protocol — e.g. the registry's ``Arm`` re-export — don't transitively
+pull in ``httpx`` providers or the SurfSense client unless they
+actually use those arms.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .base import Arm, ArmRequest, ArmResult
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .bare_llm import BareLlmArm
+    from .native_pdf import NativePdfArm
+    from .surfsense import SurfSenseArm
+
+__all__ = [
+    "Arm",
+    "ArmRequest",
+    "ArmResult",
+    "BareLlmArm",
+    "NativePdfArm",
+    "SurfSenseArm",
+]
+
+
+def __getattr__(name: str):  # PEP 562
+    if name == "NativePdfArm":
+        from .native_pdf import NativePdfArm
+
+        return NativePdfArm
+    if name == "SurfSenseArm":
+        from .surfsense import SurfSenseArm
+
+        return SurfSenseArm
+    if name == "BareLlmArm":
+        from .bare_llm import BareLlmArm
+
+        return BareLlmArm
+    raise AttributeError(f"module 'surfsense_evals.core.arms' has no attribute {name!r}")
--- a/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
+++ b/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
@ -0,0 +1,100 @@
+"""Bare-LLM arm: chat completion with prompt-only input, no retrieval.
+
+Pairs with ``SurfSenseArm`` for any benchmark that wants to measure
+"how much does the model already know without RAG?". For factuality /
+multi-hop benchmarks (FRAMES, MuSiQue, …) this produces the published
+"naive prompting" baseline — e.g. FRAMES's 40.8% on Gemini-Pro-1.5.
+
+Symmetric with ``NativePdfArm`` in shape, but the request carries no
+``pdf_paths``: the prompt itself is the only input the model gets.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..providers.openrouter_chat import OpenRouterChatProvider
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class BareLlmArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterChatProvider``.
+
+    ``name`` defaults to ``"bare_llm"`` but is overridable per-instance.
+    Suites that want two distinct OpenRouter chat arms (e.g. CRAG's
+    ``bare_llm`` vs ``long_context`` — both backed by chat-completions
+    but exercising different prompt strategies) instantiate twice with
+    different names so the metrics aggregator can keep them separate.
+    """
+
+    name: str = "bare_llm"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterChatProvider,
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+        self._system_prompt = system_prompt
+        if name:
+            self.name = name
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> BareLlmArm:
+        provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
+        return cls(
+            provider=provider,
+            max_output_tokens=max_output_tokens,
+            system_prompt=system_prompt,
+            name=name,
+        )
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                system_prompt=self._system_prompt,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "finish_reason": response.finish_reason,
+            },
+        )
+
+
+__all__ = ["BareLlmArm"]
--- a/surfsense_evals/src/surfsense_evals/core/arms/base.py
+++ b/surfsense_evals/src/surfsense_evals/core/arms/base.py
@ -0,0 +1,93 @@
+"""Arm protocol + the value types every arm exchanges with a runner.
+
+An ``Arm`` is "one way to answer one question". Three ship in this PR:
+
+* ``NativePdfArm`` — drop the PDF straight into an OpenRouter
+  chat-completions request with ``plugins=[{file-parser, engine:
+  native}]``. Used for the head-to-head "is the model good enough on
+  its own?" measurement.
+* ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with the question
+  scoped to the relevant ``mentioned_document_ids``; consume the SSE
+  stream and parse citations.
+* ``BareLlmArm`` — prompt-only chat completion with no retrieval and
+  no PDF; the "how much does the model already know?" baseline.
+
+All of them implement the same protocol so a benchmark runner only sees
+``Arm.answer(request) -> ArmResult``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol
+
+
+@dataclass
+class ArmRequest:
+    """One arm-call worth of input.
+
+    * ``question_id`` is opaque — used for logging and joining results.
+    * ``prompt`` is the fully-formatted text the arm should send. The
+      runner is responsible for prompt construction so head-to-head
+      comparisons use byte-identical text.
+    * ``pdf_paths`` is the per-question source PDFs (used by
+      ``NativePdfArm``). Empty for retrieval-only / corpus-wide
+      benchmarks.
+    * ``mentioned_document_ids`` is the SurfSense document scoping list
+      (used by ``SurfSenseArm``). When ``None`` SurfSense retrieves
+      across the whole search space.
+    * ``options`` is a free-form bag of arm-specific overrides
+      (e.g. SurfSense's ``disabled_tools``).
+    """
+
+    question_id: str
+    prompt: str
+    pdf_paths: list[Path] = field(default_factory=list)
+    mentioned_document_ids: list[int] | None = None
+    options: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ArmResult:
+    """Outcome of one ``Arm.answer`` invocation."""
+
+    arm: str
+    question_id: str
+    raw_text: str
+    answer_letter: str | None = None
+    citations: list[dict[str, Any]] = field(default_factory=list)
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_micros: int = 0
+    latency_ms: int = 0
+    error: str | None = None
+    extra: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def ok(self) -> bool:
+        return self.error is None
+
+    def to_jsonl(self) -> dict[str, Any]:
+        """Stable dict shape for ``data/<suite>/runs/<ts>/<bench>_raw.jsonl``."""
+
+        return {
+            "arm": self.arm,
+            "question_id": self.question_id,
+            "answer_letter": self.answer_letter,
+            "raw_text": self.raw_text,
+            "citations": self.citations,
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "cost_micros": self.cost_micros,
+            "latency_ms": self.latency_ms,
+            "error": self.error,
+            "extra": self.extra,
+        }
+
+
+class Arm(Protocol):
+    """One concrete way to answer questions for a given run."""
+
+    # Label identifying the arm; the concrete arms in this package copy it
+    # into ``ArmResult.arm``.
+    name: str
+
+    async def answer(self, request: ArmRequest) -> ArmResult:  # pragma: no cover - protocol
+        """Answer one question described by ``request`` and return its ``ArmResult``."""
+        ...
--- a/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
+++ b/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
@ -0,0 +1,104 @@
+"""Native-PDF arm: drop the PDF straight into OpenRouter chat-completions.
+
+Generic across suites — a benchmark just supplies the prompt and the
+single PDF path. Multi-PDF questions concatenate in the runner before
+calling this arm so each ``answer`` invocation feeds the model exactly
+one ``data:application/pdf;base64,...`` block (matches the human
+"drag-and-drop one PDF into Claude" intent).
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..parse.answer_letter import extract_answer_letter
+from ..providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class NativePdfArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterPdfProvider``."""
+
+    name: str = "native_pdf"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterPdfProvider,
+        max_output_tokens: int | None = 1024,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+    ) -> NativePdfArm:
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            engine=engine,
+        )
+        return cls(provider=provider, max_output_tokens=max_output_tokens)
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        if not request.pdf_paths:
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error="native_pdf arm requires at least one pdf_path",
+            )
+        if len(request.pdf_paths) > 1:
+            # The plan calls out one-PDF-per-question so the head-to-head
+            # is fair; runners are responsible for upstream concatenation.
+            logger.debug(
+                "qid=%s native_pdf got %d pdfs; using first only",
+                request.question_id,
+                len(request.pdf_paths),
+            )
+        pdf = request.pdf_paths[0]
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                pdf_path=pdf,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+
+        letter = extract_answer_letter(response.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            answer_letter=letter.letter,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "engine": self._provider.engine.value,
+                "answer_letter_strategy": letter.strategy,
+                "finish_reason": response.finish_reason,
+                "pdf_filename": pdf.name,
+            },
+        )
+
+
+__all__ = ["NativePdfArm"]
--- a/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
+++ b/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
@ -0,0 +1,104 @@
+"""SurfSense arm: per-question fresh thread + ``/api/v1/new_chat`` stream.
+
+For every question:
+
+* Create a fresh ``NewChatThread`` on the suite's pinned SearchSpace.
+  This sidesteps the per-thread ``THREAD_BUSY`` 409 (a single thread
+  serialises turns, see ``surfsense_backend/app/routes/new_chat_routes.py:191-220``).
+* POST ``/api/v1/new_chat`` with the prompt and the per-question
+  ``mentioned_document_ids`` (``surfsense_backend/app/schemas/new_chat.py:241-243``).
+* Consume the SSE stream via ``NewChatClient.ask`` which accumulates
+  text deltas and returns ``StreamedAnswer``.
+* Optionally delete the thread (default ON for ephemeral runs).
+
+Citations are parsed from the streamed assistant text via the
+canonical regex port; chunk ids are returned in ``ArmResult.citations``
+for the runner to map back to corpus ids.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..clients import NewChatClient
+from ..parse.answer_letter import extract_answer_letter
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class SurfSenseArm(Arm):
+    """``Arm`` implementation backed by ``NewChatClient``."""
+
+    name: str = "surfsense"
+
+    def __init__(
+        self,
+        *,
+        client: NewChatClient,
+        search_space_id: int,
+        ephemeral_threads: bool = True,
+        thread_title_prefix: str = "eval",
+    ) -> None:
+        self._client = client
+        self._search_space_id = search_space_id
+        self._ephemeral = ephemeral_threads
+        self._title_prefix = thread_title_prefix
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        thread_id: int | None = None
+        try:
+            thread_id = await self._client.create_thread(
+                search_space_id=self._search_space_id,
+                title=f"{self._title_prefix}:{request.question_id}",
+            )
+            answer = await self._client.ask(
+                thread_id=thread_id,
+                search_space_id=self._search_space_id,
+                user_query=request.prompt,
+                mentioned_document_ids=request.mentioned_document_ids,
+                disabled_tools=request.options.get("disabled_tools"),
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+                extra={"thread_id": thread_id},
+            )
+        finally:
+            if self._ephemeral and thread_id is not None:
+                try:
+                    await self._client.delete_thread(thread_id)
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "Failed to delete thread %s: %s", thread_id, exc
+                    )
+
+        letter = extract_answer_letter(answer.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=answer.text,
+            answer_letter=letter.letter,
+            citations=answer.citations,
+            latency_ms=answer.latency_ms,
+            # SurfSense doesn't surface input/output token counts in the
+            # SSE stream today; leaving the cost / token fields at 0
+            # documents that gap. Estimating from the raw text would
+            # bias the comparison against the SurfSense arm.
+            extra={
+                "thread_id": thread_id,
+                "search_space_id": self._search_space_id,
+                "answer_letter_strategy": letter.strategy,
+                "user_message_id": answer.user_message_id,
+                "assistant_message_id": answer.assistant_message_id,
+                "finished_normally": answer.finished_normally,
+                "n_raw_events": len(answer.raw_events),
+                "n_mentioned_documents": len(request.mentioned_document_ids or []),
+            },
+        )
+
+
+__all__ = ["SurfSenseArm"]
--- a/surfsense_evals/src/surfsense_evals/core/auth.py
+++ b/surfsense_evals/src/surfsense_evals/core/auth.py
@ -0,0 +1,273 @@
+"""Dual-mode credential resolver + httpx client factory with 401 auto-refresh.
+
+SurfSense supports ``AUTH_TYPE=LOCAL`` (email + password) and
+``AUTH_TYPE=GOOGLE`` (Google OAuth → frontend stores JWT in ``localStorage``).
+There is no headless equivalent of the Google flow, so the harness handles
+both modes by treating the JWT as the universal credential:
+
+* **LOCAL**: harness POSTs form-encoded ``username`` + ``password`` to
+  ``/auth/jwt/login``, reads ``{access_token, refresh_token}``.
+* **GOOGLE / pre-issued JWT**: operator pastes their existing JWT (and
+  optionally refresh token) into ``SURFSENSE_JWT`` /
+  ``SURFSENSE_REFRESH_TOKEN``; harness skips login.
+
+Either way ``client_with_auth`` returns one shared
+``httpx.AsyncClient`` with ``Authorization: Bearer <jwt>`` set and an
+event hook that, on a 401 with a refresh token in scope, calls
+``POST /auth/jwt/refresh`` and retries the original request once. JWT
+lifetime defaults to one day backend-side, so this matters for long
+MIRAGE runs.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class CredentialError(RuntimeError):
+    """Raised when credentials cannot be resolved into a token.
+
+    ``acquire_token`` raises it when no credential mode is configured, when
+    the LOCAL login call returns a non-200 status, and when the login
+    response carries no ``access_token``.
+    """
+
+
+_NO_CREDENTIALS_MESSAGE = (
+    "No SurfSense credentials configured. Set ONE of:\n"
+    "  (LOCAL)  SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD\n"
+    "  (GOOGLE) SURFSENSE_JWT (and optionally SURFSENSE_REFRESH_TOKEN)\n"
+    "For GOOGLE: log in to SurfSense in your browser, open DevTools → "
+    "Application → Local Storage → copy `surfsense_bearer_token` and "
+    "`surfsense_refresh_token` into those env vars."
+)
+
+
+@dataclass
+class TokenBundle:
+    """Mutable token state — refresh hook updates ``access_token`` in place.
+
+    A successful refresh also replaces ``refresh_token`` when the refresh
+    response supplies a new one.
+    """
+
+    # Bearer value stamped into the ``Authorization`` header.
+    access_token: str
+    # ``None`` means no refresh is attempted on a 401.
+    refresh_token: str | None = None
+    # ``mode`` is informational only ("local" or "jwt"); used in error messages.
+    mode: str = "jwt"
+
+
+# ---------------------------------------------------------------------------
+# Token acquisition
+# ---------------------------------------------------------------------------
+
+
+async def acquire_token(config: Config, *, http: httpx.AsyncClient | None = None) -> TokenBundle:
+    """Resolve credentials → ``TokenBundle``.
+
+    Precedence:
+
+    1. ``SURFSENSE_JWT`` set → use it directly. Refresh token captured if
+       supplied.
+    2. ``SURFSENSE_USER_EMAIL`` + ``SURFSENSE_USER_PASSWORD`` set →
+       form-encoded POST to ``/auth/jwt/login``.
+    3. Neither → raise ``CredentialError``.
+
+    The optional ``http`` argument lets tests inject a mocked client; if
+    omitted a one-shot client is created for the login call only.
+    """
+
+    if config.has_jwt_mode():
+        return TokenBundle(
+            access_token=config.surfsense_jwt or "",
+            refresh_token=config.surfsense_refresh_token,
+            mode="jwt",
+        )
+
+    if config.has_local_mode():
+        async def _login(client: httpx.AsyncClient) -> TokenBundle:
+            response = await client.post(
+                f"{config.surfsense_api_base}/auth/jwt/login",
+                data={
+                    "username": config.surfsense_user_email,
+                    "password": config.surfsense_user_password,
+                },
+                headers={"Accept": "application/json"},
+            )
+            if response.status_code != 200:
+                raise CredentialError(
+                    f"LOCAL login failed (HTTP {response.status_code}): "
+                    f"{_safe_text(response)}"
+                )
+            payload = response.json()
+            access = payload.get("access_token")
+            if not access:
+                raise CredentialError(
+                    f"LOCAL login response missing access_token: {payload!r}"
+                )
+            return TokenBundle(
+                access_token=access,
+                refresh_token=payload.get("refresh_token") or None,
+                mode="local",
+            )
+
+        if http is not None:
+            return await _login(http)
+        async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, connect=10.0)) as client:
+            return await _login(client)
+
+    raise CredentialError(_NO_CREDENTIALS_MESSAGE)
+
+
+def _safe_text(response: httpx.Response, *, limit: int = 200) -> str:
+    body = response.text or ""
+    if len(body) > limit:
+        return body[:limit] + "…"
+    return body
+
+
+# ---------------------------------------------------------------------------
+# httpx client + 401 auto-refresh
+# ---------------------------------------------------------------------------
+
+
+class _AuthState:
+    """Shared mutable holder closed over by the auth event hook.
+
+    Kept private so callers can't accidentally mutate the access token
+    out-of-band; ``client_with_auth`` returns the client directly.
+    """
+
+    def __init__(self, config: Config, tokens: TokenBundle) -> None:
+        # Read for ``surfsense_api_base`` when building the refresh URL.
+        self.config = config
+        # Live token bundle; the refresh path mutates it in place.
+        self.tokens = tokens
+        # Set while ``_AuthAwareClient.send`` is awaiting a refresh, so a
+        # 401 seen during that window is returned without another refresh.
+        self._refresh_in_flight: bool = False
+
+
+def _build_auth_request(state: _AuthState, request: httpx.Request) -> None:
+    """Stamp the current bearer onto ``request`` (request-event hook)."""
+
+    request.headers["Authorization"] = f"Bearer {state.tokens.access_token}"
+
+
+async def _refresh_access_token(
+    state: _AuthState, transport: httpx.AsyncBaseTransport | None = None
+) -> bool:
+    """POST ``/auth/jwt/refresh`` with the current refresh token.
+
+    Returns ``True`` on success and updates ``state.tokens`` in place.
+    Returns ``False`` if no refresh token is configured or the call fails.
+    Recursive 401s are avoided by using a *new* client without the auth
+    hook.
+    """
+
+    refresh = state.tokens.refresh_token
+    if not refresh:
+        return False
+    try:
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(15.0, connect=5.0),
+            transport=transport,
+        ) as inner:
+            response = await inner.post(
+                f"{state.config.surfsense_api_base}/auth/jwt/refresh",
+                json={"refresh_token": refresh},
+                headers={"Accept": "application/json"},
+            )
+    except httpx.HTTPError as exc:
+        logger.warning("Token refresh transport error: %s", exc)
+        return False
+    if response.status_code != 200:
+        logger.warning(
+            "Token refresh rejected (HTTP %s): %s",
+            response.status_code,
+            _safe_text(response),
+        )
+        return False
+    payload = response.json()
+    new_access = payload.get("access_token")
+    if not new_access:
+        logger.warning("Refresh response missing access_token: %r", payload)
+        return False
+    state.tokens.access_token = new_access
+    new_refresh = payload.get("refresh_token")
+    if new_refresh:
+        state.tokens.refresh_token = new_refresh
+    return True
+
+
+def client_with_auth(
+    config: Config,
+    tokens: TokenBundle,
+    *,
+    timeout: float = 60.0,
+    transport: httpx.AsyncBaseTransport | None = None,
+    base_url: str | None = None,
+) -> httpx.AsyncClient:
+    """Build a single shared ``httpx.AsyncClient`` for the SurfSense API.
+
+    * Stamps ``Authorization: Bearer <jwt>`` on every outgoing request.
+    * On any 401 response, attempts a single refresh (if a refresh token
+      is configured) and retries the original request once. The retry
+      uses a fresh stamping of the bearer header, so a successful
+      refresh transparently unblocks long runs.
+    * The retry is best-effort — repeated 401s after a refresh attempt
+      are surfaced to the caller so they can re-auth manually.
+
+    Pass ``base_url`` to scope a sub-client (e.g. tests). The default
+    keeps full URLs in calling code, which makes route-spec citations in
+    the codebase easier to grep.
+    """
+
+    state = _AuthState(config, tokens)
+
+    async def _request_hook(request: httpx.Request) -> None:
+        _build_auth_request(state, request)
+
+    # ``send`` is overridden in ``_AuthAwareClient`` to retry once on 401
+    # after refreshing the bearer. httpx's response event-hook can't
+    # *replace* a response, so we need a subclass to do the replay.
+    client = _AuthAwareClient(
+        state=state,
+        transport=transport,
+        timeout=httpx.Timeout(timeout, connect=10.0),
+        base_url=base_url or "",
+        event_hooks={"request": [_request_hook]},
+    )
+    return client
+
+
+class _AuthAwareClient(httpx.AsyncClient):
+    """``AsyncClient`` that retries once on 401 after refreshing the token."""
+
+    def __init__(self, *, state: _AuthState, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self._auth_state = state
+
+    async def send(  # type: ignore[override]
+        self, request: httpx.Request, **kwargs: Any
+    ) -> httpx.Response:
+        response = await super().send(request, **kwargs)
+        if response.status_code != 401:
+            return response
+        # Don't refresh while a refresh is itself in flight.
+        if self._auth_state._refresh_in_flight:
+            return response
+        self._auth_state._refresh_in_flight = True
+        try:
+            refreshed = await _refresh_access_token(self._auth_state)
+        finally:
+            self._auth_state._refresh_in_flight = False
+        if not refreshed:
+            return response
+        # Re-stamp and replay once. ``request`` is reusable.
+        await response.aclose()
+        request.headers["Authorization"] = f"Bearer {self._auth_state.tokens.access_token}"
+        return await super().send(request, **kwargs)
+
+
+__all__ = [
+    "CredentialError",
+    "TokenBundle",
+    "acquire_token",
+    "client_with_auth",
+]
--- a/surfsense_evals/src/surfsense_evals/core/cli.py
+++ b/surfsense_evals/src/surfsense_evals/core/cli.py
@ -0,0 +1,790 @@
+"""Argparse CLI for ``python -m surfsense_evals``.
+
+Subcommands:
+
+* ``setup    --suite <name> --provider-model <slug> [--agent-llm-id <int>]``
+* ``teardown --suite <name>``
+* ``models  list [--provider openrouter] [--grep <s>]``
+* ``suites  list``
+* ``benchmarks list [--suite <name>]``
+* ``ingest <suite> <benchmark> [benchmark flags]``
+* ``run    <suite> <benchmark> [benchmark flags]``
+* ``report --suite <name> [--benchmark <name>]``
+
+The ``ingest`` / ``run`` subparsers are built dynamically from the
+registry — adding a new benchmark only requires registering it; the
+CLI surface comes for free. ``add_run_args`` lets each benchmark
+publish its own flags.
+
+Design choices worth flagging:
+
+* ``setup`` rejects ``agent_llm_id == 0`` (Auto / LiteLLM router) so
+  per-question accuracy is reproducible.
+* ``setup`` validates that the picked LLM config has
+  ``provider == "OPENROUTER"`` and ``model_name == --provider-model``
+  before declaring success — both arms of the head-to-head must hit
+  the same OpenRouter slug.
+* Lifecycle state is keyed by suite, so ``setup --suite legal`` does
+  not touch ``medical``'s SearchSpace, and vice versa.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+from rich.console import Console
+from rich.table import Table
+
+# Windows' legacy console (cp1252) crashes when Rich tries to write characters
+# outside the active codepage (e.g. '->', em-dashes, box-drawing). Force UTF-8
+# on stdout/stderr and disable Rich's legacy_windows render path so the file
+# stream is used directly. Modern Windows (>=10, VS Code terminal, Windows
+# Terminal, PowerShell, cmd) all interpret ANSI escapes natively.
+if sys.platform == "win32":
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(encoding="utf-8", errors="replace")
+        except (AttributeError, ValueError):
+            # Best effort: presumably the stream was replaced by an object
+            # without ``reconfigure`` or one that rejects the call — in that
+            # case leave it exactly as it is.
+            pass
+
+from . import registry
+from .auth import CredentialError, acquire_token, client_with_auth
+from .clients import SearchSpaceClient
+from .clients.search_space import LlmPreferences
+from .config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    Config,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+    utc_iso_timestamp,
+)
+from .vision_llm import VisionConfigError, resolve_vision_llm
+
+logger = logging.getLogger("surfsense_evals")
+console = Console(legacy_windows=False)
+
+
+# ---------------------------------------------------------------------------
+# Discovery
+# ---------------------------------------------------------------------------
+
+
+def _discover_suites() -> list[str]:
+    """Trigger ``register(...)`` for every benchmark.
+
+    Imported lazily so ``models list`` (which doesn't need any
+    benchmark) still runs fast.
+    """
+
+    from surfsense_evals.suites import discover_suites
+
+    return discover_suites()
+
+
+# ---------------------------------------------------------------------------
+# Global LLM config fetcher (used by setup + models list)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LlmConfigEntry:
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmConfigEntry:
+        return cls(
+            id=int(payload["id"]),
+            name=str(payload.get("name", "")),
+            provider=str(payload.get("provider", "")).upper(),
+            model_name=str(payload.get("model_name", "")),
+            raw=payload,
+        )
+
+
+async def _list_global_llm_configs(http: httpx.AsyncClient, base: str) -> list[LlmConfigEntry]:
+    response = await http.get(
+        f"{base}/api/v1/global-new-llm-configs",
+        headers={"Accept": "application/json"},
+    )
+    response.raise_for_status()
+    payload = response.json()
+    if not isinstance(payload, list):
+        raise RuntimeError(f"Unexpected /global-new-llm-configs payload: {payload!r}")
+    return [LlmConfigEntry.from_payload(item) for item in payload]
+
+
+def _resolve_openrouter_id(
+    candidates: list[LlmConfigEntry],
+    provider_model: str,
+    *,
+    explicit_id: int | None,
+) -> int:
+    """Resolve the SurfSense LLM id for ``provider_model``.
+
+    Behaviour:
+
+    * If ``explicit_id`` is given: return it directly. The caller is
+      then expected to GET-validate that the row's
+      ``provider == "OPENROUTER"`` and ``model_name`` matches the slug.
+      That branch supports positive BYOK ``NewLLMConfig`` rows whose
+      slugs may overlap with global OpenRouter virtuals.
+    * Otherwise: filter to ``provider == "OPENROUTER"`` and
+      ``model_name == provider_model``. Expect exactly one match —
+      raise with a friendly message otherwise.
+    """
+
+    if explicit_id is not None:
+        return explicit_id
+
+    matches = [
+        c for c in candidates if c.provider == "OPENROUTER" and c.model_name == provider_model
+    ]
+    if not matches:
+        sample = ", ".join(
+            f"{c.model_name} (id={c.id})" for c in candidates if c.provider == "OPENROUTER"
+        )[:600]
+        raise RuntimeError(
+            f"No OpenRouter config found for slug '{provider_model}'. "
+            "Make sure `openrouter_integration.enabled: true` in "
+            "global_llm_config.yaml and that the Celery worker has "
+            "finished its first refresh (the catalogue is fetched at "
+            "Celery startup per `app/celery_app.py`). "
+            f"Available OpenRouter slugs (sample): {sample or '<none>'}.\n"
+            "Browse with: python -m surfsense_evals models list --grep <substring>"
+        )
+    if len(matches) > 1:
+        listing = "\n".join(f"  id={c.id}  name={c.name!r}" for c in matches)
+        raise RuntimeError(
+            f"Multiple OpenRouter configs for slug '{provider_model}':\n{listing}\n"
+            "Pass --agent-llm-id <id> to disambiguate."
+        )
+    return matches[0].id
+
+
+# ---------------------------------------------------------------------------
+# Subcommand implementations
+# ---------------------------------------------------------------------------
+
+
+async def _cmd_setup(args: argparse.Namespace) -> int:
+    """Handle ``setup``: prepare a suite's SearchSpace and pin its LLMs.
+
+    Steps, in order:
+
+    1. Validate the flags (``agent_llm_id``, scenario, native-arm model).
+    2. Acquire a token and list the global LLM configs.
+    3. Resolve the agent LLM id for ``--provider-model``.
+    4. Reuse the SearchSpace recorded in the suite state, or create a new
+       one when there is no state or the backend answers 404.
+    5. Optionally resolve a vision LLM config.
+    6. Write the LLM preferences, read them back and verify the pin.
+    7. Persist a fresh ``SuiteState`` for the suite.
+
+    Returns ``0`` on success and ``2`` on any validation, credential or
+    resolution failure that is reported to the console. HTTP errors other
+    than the handled 404 propagate to the caller.
+    """
+    suite = args.suite
+    provider_model: str = args.provider_model
+    explicit_id: int | None = args.agent_llm_id
+    scenario: str = args.scenario
+    vision_llm_slug: str | None = args.vision_llm
+    native_arm_model: str | None = args.native_arm_model
+    skip_vision_setup: bool = args.no_vision_llm_setup
+
+    if explicit_id == 0:
+        console.print(
+            "[red]agent_llm_id == 0 (Auto / LiteLLM router) is not allowed — "
+            "results would not be reproducible.[/red]"
+        )
+        return 2
+
+    if scenario not in SCENARIOS:
+        console.print(
+            f"[red]Unknown scenario {scenario!r}. Pick one of: "
+            f"{', '.join(SCENARIOS)}[/red]"
+        )
+        return 2
+
+    # Scenario-specific validation. Each branch documents WHY the rule
+    # exists so the operator's mental model matches what the runner does.
+    if scenario == "cost-arbitrage":
+        if not native_arm_model:
+            console.print(
+                "[red]--scenario cost-arbitrage requires --native-arm-model "
+                "<vision-capable slug>.[/red] The native arm needs a vision "
+                "model to fairly answer image-bearing questions; SurfSense "
+                "answers from already-extracted text via --provider-model."
+            )
+            return 2
+        if native_arm_model == provider_model:
+            # Warning only: setup continues with the degenerate pairing.
+            console.print(
+                "[yellow]--native-arm-model equals --provider-model in "
+                "cost-arbitrage; that's degenerate (same as head-to-head). "
+                "Pick a different slug or switch to --scenario head-to-head.[/yellow]"
+            )
+    elif scenario in ("head-to-head", "symmetric-cheap"):
+        if native_arm_model:
+            console.print(
+                f"[yellow]--native-arm-model is ignored for --scenario {scenario} "
+                f"(both arms answer with --provider-model={provider_model!r}).[/yellow]"
+            )
+            native_arm_model = None  # don't persist a stale value
+
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    async with client_with_auth(config, token) as http:
+        candidates = await _list_global_llm_configs(http, config.surfsense_api_base)
+
+        try:
+            agent_llm_id = _resolve_openrouter_id(
+                candidates, provider_model, explicit_id=explicit_id
+            )
+        except RuntimeError as exc:
+            console.print(f"[red]{exc}[/red]")
+            return 2
+
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        existing = get_suite_state(config, suite)
+        if existing is not None:
+            try:
+                row = await ss_client.get(existing.search_space_id)
+                console.print(
+                    f"Reusing existing SearchSpace [cyan]{row.name}[/cyan] "
+                    f"(id={row.id}) for suite [bold]{suite}[/bold]."
+                )
+                search_space_id = row.id
+            except httpx.HTTPStatusError as exc:
+                if exc.response.status_code == 404:
+                    console.print(
+                        f"[yellow]state.json pointed at SearchSpace id={existing.search_space_id} "
+                        f"but backend returned 404; creating a fresh one.[/yellow]"
+                    )
+                    # Drop the stale state so the creation branch below runs
+                    # and no ingestion maps are carried over.
+                    existing = None
+                else:
+                    raise
+        if existing is None:
+            ss_name = f"eval-{suite}-{utc_iso_timestamp()}"
+            row = await ss_client.create(
+                ss_name, description=f"surfsense-evals lifecycle ({suite})"
+            )
+            console.print(
+                f"Created SearchSpace [cyan]{row.name}[/cyan] (id={row.id}) "
+                f"for suite [bold]{suite}[/bold]."
+            )
+            search_space_id = row.id
+
+        # Resolve + attach the vision LLM config (unless explicitly skipped).
+        # Asymmetric scenarios make the vision LLM at ingest a hard
+        # requirement — without it, SurfSense's chunks have no image
+        # content and the entire framing collapses.
+        vision_required = scenario in ("symmetric-cheap", "cost-arbitrage")
+        vision_config_id: int | None = None
+        vision_provider_model: str | None = None
+        if not skip_vision_setup and (vision_required or vision_llm_slug is not None):
+            try:
+                vision_candidates = await ss_client.list_global_vision_llm_configs()
+                resolved = resolve_vision_llm(
+                    vision_candidates, explicit_slug=vision_llm_slug
+                )
+            except VisionConfigError as exc:
+                console.print(f"[red]{exc}[/red]")
+                return 2
+            vision_config_id = resolved.config_id
+            vision_provider_model = resolved.provider_model
+            console.print(
+                f"Vision LLM at ingest: [cyan]{vision_provider_model}[/cyan] "
+                f"(id={vision_config_id}, selected_via={resolved.selected_via})."
+            )
+
+        # Only send the vision key when one was resolved, so a skipped
+        # vision setup does not include it in the preferences update.
+        pref_kwargs: dict[str, Any] = {"agent_llm_id": agent_llm_id}
+        if vision_config_id is not None:
+            pref_kwargs["vision_llm_config_id"] = vision_config_id
+
+        # Write, then read back and compare: the pin is only trusted once
+        # the backend reports the expected provider / model / vision id.
+        await ss_client.set_llm_preferences(search_space_id, **pref_kwargs)
+        prefs = await ss_client.get_llm_preferences(search_space_id)
+        if not _validate_pin(prefs, provider_model):
+            agent = prefs.agent_llm or {}
+            console.print(
+                f"[red]LLM pin validation FAILED.[/red] After PUT, "
+                f"agent_llm.provider={agent.get('provider')!r}, "
+                f"model_name={agent.get('model_name')!r}; expected "
+                f"provider=OPENROUTER, model_name={provider_model!r}."
+            )
+            return 2
+        if vision_config_id is not None and prefs.vision_llm_config_id != vision_config_id:
+            console.print(
+                f"[red]Vision LLM pin validation FAILED.[/red] After PUT, "
+                f"vision_llm_config_id={prefs.vision_llm_config_id!r}; "
+                f"expected {vision_config_id!r}."
+            )
+            return 2
+
+        # ``ingestion_maps`` survive a re-run of setup only when the
+        # previous SearchSpace was reused (``existing`` still set).
+        suite_state = SuiteState(
+            search_space_id=search_space_id,
+            agent_llm_id=agent_llm_id,
+            provider_model=provider_model,
+            created_at=utc_iso_timestamp(),
+            ingestion_maps=existing.ingestion_maps if existing else {},
+            scenario=scenario,
+            vision_llm_config_id=vision_config_id,
+            vision_provider_model=vision_provider_model,
+            native_arm_model=native_arm_model,
+        )
+        set_suite_state(config, suite, suite_state)
+
+    summary_bits = [
+        f"suite={suite!r}",
+        f"scenario={scenario!r}",
+        f"search_space_id={suite_state.search_space_id}",
+        f"agent_llm_id={suite_state.agent_llm_id}",
+        f"provider_model={suite_state.provider_model!r}",
+    ]
+    if suite_state.vision_provider_model:
+        summary_bits.append(f"vision_provider_model={suite_state.vision_provider_model!r}")
+    if suite_state.native_arm_model:
+        summary_bits.append(f"native_arm_model={suite_state.native_arm_model!r}")
+    console.print(f"[green]setup OK[/green] {' '.join(summary_bits)}")
+    return 0
+
+
+def _validate_pin(prefs: LlmPreferences, provider_model: str) -> bool:
+    agent = prefs.agent_llm or {}
+    return (
+        str(agent.get("provider", "")).upper() == "OPENROUTER"
+        and str(agent.get("model_name", "")) == provider_model
+    )
+
+
+async def _cmd_teardown(args: argparse.Namespace) -> int:
+    suite = args.suite
+    config = load_config()
+    state = get_suite_state(config, suite)
+    if state is None:
+        console.print(f"[yellow]No state for suite {suite!r}; nothing to tear down.[/yellow]")
+        return 0
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        try:
+            await ss_client.delete(state.search_space_id)
+        except httpx.HTTPStatusError as exc:
+            console.print(
+                f"[yellow]DELETE failed (HTTP {exc.response.status_code}); "
+                "clearing state.json anyway.[/yellow]"
+            )
+    clear_suite_state(config, suite)
+    console.print(
+        f"[green]teardown OK[/green] suite={suite!r} "
+        f"(SearchSpace soft-deleted, state.json slot cleared)."
+    )
+    return 0
+
+
+async def _cmd_models_list(args: argparse.Namespace) -> int:
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        entries = await _list_global_llm_configs(http, config.surfsense_api_base)
+    grep = (args.grep or "").lower()
+    provider_filter = (args.provider or "").upper()
+    rows: list[LlmConfigEntry] = []
+    for e in entries:
+        if provider_filter and e.provider != provider_filter:
+            continue
+        if grep and grep not in e.model_name.lower() and grep not in e.name.lower():
+            continue
+        rows.append(e)
+    table = Table(
+        title=f"Global LLM configs ({len(rows)} of {len(entries)})",
+        show_lines=False,
+    )
+    table.add_column("id", justify="right", style="cyan")
+    table.add_column("provider", style="magenta")
+    table.add_column("model_name", style="green")
+    table.add_column("name")
+    for e in sorted(rows, key=lambda x: (x.provider, x.model_name)):
+        table.add_row(str(e.id), e.provider, e.model_name, e.name)
+    console.print(table)
+    return 0
+
+
+def _cmd_suites_list(_args: argparse.Namespace) -> int:
+    _discover_suites()
+    suites = registry.list_suites()
+    if not suites:
+        console.print(
+            "[yellow]No suites registered. Drop a benchmark under "
+            "src/surfsense_evals/suites/<domain>/<benchmark>/.[/yellow]"
+        )
+        return 0
+    table = Table(title=f"Registered suites ({len(suites)})")
+    table.add_column("suite", style="bold")
+    table.add_column("benchmarks", style="green")
+    for suite in suites:
+        names = [b.name for b in registry.list_benchmarks(suite)]
+        table.add_row(suite, ", ".join(names) or "<none>")
+    console.print(table)
+    return 0
+
+
+def _cmd_benchmarks_list(args: argparse.Namespace) -> int:
+    _discover_suites()
+    benchmarks = registry.list_benchmarks(args.suite)
+    if not benchmarks:
+        console.print("[yellow]No benchmarks registered.[/yellow]")
+        return 0
+    table = Table(title=f"Benchmarks ({len(benchmarks)})")
+    table.add_column("suite", style="bold")
+    table.add_column("name", style="cyan")
+    table.add_column("headline", justify="center")
+    table.add_column("description")
+    for b in benchmarks:
+        table.add_row(
+            b.suite,
+            b.name,
+            "yes" if b.headline else "no",
+            getattr(b, "description", ""),
+        )
+    console.print(table)
+    return 0
+
+
+async def _cmd_ingest(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    # Forward parsed CLI flags into ingest() so a benchmark can honour
+    # its own flags (e.g. MIRAGE's --skip-snippet-filter / --corpus).
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        await benchmark.ingest(ctx, **extra_kwargs)
+    console.print(f"[green]ingest OK[/green] {args.suite}/{args.benchmark}")
+    return 0
+
+
+async def _cmd_run(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        artifact = await benchmark.run(ctx, **extra_kwargs)
+
+    console.print(
+        f"[green]run OK[/green] {args.suite}/{args.benchmark} → "
+        f"{artifact.raw_path}"
+    )
+    return 0
+
+
+async def _cmd_report(args: argparse.Namespace) -> int:
+    from .report import write_report
+
+    benchmark_filter = args.benchmark
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(f"[red]No setup for suite {args.suite!r}.[/red]")
+        return 2
+    benchmarks = registry.list_benchmarks(args.suite)
+    if benchmark_filter:
+        benchmarks = [b for b in benchmarks if b.name == benchmark_filter]
+        if not benchmarks:
+            console.print(
+                f"[red]No registered benchmark named {benchmark_filter!r} in suite {args.suite!r}.[/red]"
+            )
+            return 2
+
+    artifacts = _collect_artifacts(config, args.suite, [b.name for b in benchmarks])
+    if not artifacts:
+        console.print(
+            "[yellow]No run artifacts found under "
+            f"{config.suite_runs_dir(args.suite)}. Run a benchmark first.[/yellow]"
+        )
+        return 1
+
+    grouped: dict[str, list[registry.RunArtifact]] = {}
+    for art in artifacts:
+        grouped.setdefault(art.benchmark, []).append(art)
+    sections: list[registry.ReportSection] = []
+    for benchmark in benchmarks:
+        if benchmark.name not in grouped:
+            continue
+        sections.append(benchmark.report_section(grouped[benchmark.name]))
+
+    summary_path = write_report(
+        config=config,
+        suite=args.suite,
+        sections=sections,
+        run_timestamp=utc_iso_timestamp(),
+    )
+    console.print(f"[green]report OK[/green] → {summary_path}")
+    return 0
+
+
+def _collect_artifacts(
+    config: Config, suite: str, benchmark_names: list[str]
+) -> list[registry.RunArtifact]:
+    """Walk ``data/<suite>/runs/*/<benchmark>/`` for the latest artifacts.
+
+    Reads any ``run_artifact.json`` written by a benchmark runner. The
+    runner is responsible for writing this manifest alongside its raw
+    JSONL so the report writer doesn't have to know benchmark-specific
+    metric shapes.
+    """
+
+    runs_dir = config.suite_runs_dir(suite)
+    if not runs_dir.exists():
+        return []
+    artifacts: list[registry.RunArtifact] = []
+    by_bench: dict[str, registry.RunArtifact] = {}
+    for ts_dir in sorted(runs_dir.iterdir()):
+        if not ts_dir.is_dir():
+            continue
+        for bench_name in benchmark_names:
+            bench_dir = ts_dir / bench_name
+            manifest = bench_dir / "run_artifact.json"
+            if not manifest.exists():
+                continue
+            try:
+                with manifest.open("r", encoding="utf-8") as fh:
+                    payload = json.load(fh)
+            except (OSError, json.JSONDecodeError):
+                continue
+            artifact = registry.RunArtifact(
+                suite=suite,
+                benchmark=bench_name,
+                run_timestamp=ts_dir.name,
+                raw_path=bench_dir / payload.get("raw_path", "raw.jsonl"),
+                metrics=payload.get("metrics", {}),
+                extra=payload.get("extra", {}),
+            )
+            # Latest run wins per benchmark.
+            by_bench[bench_name] = artifact
+    artifacts = list(by_bench.values())
+    return artifacts
+
+
+# ---------------------------------------------------------------------------
+# Argparse wiring
+# ---------------------------------------------------------------------------
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="surfsense-evals",
+        description="SurfSense evaluation harness — domain-agnostic core + pluggable suites.",
+    )
+    parser.add_argument(
+        "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"]
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    p_setup = sub.add_parser("setup", help="Create per-suite SearchSpace + pin LLM.")
+    p_setup.add_argument("--suite", required=True)
+    p_setup.add_argument(
+        "--provider-model",
+        required=True,
+        help=(
+            "OpenRouter slug for the SurfSense answer LLM (and the native arm "
+            "too unless --native-arm-model is set), e.g. "
+            "'anthropic/claude-sonnet-4.5'."
+        ),
+    )
+    p_setup.add_argument(
+        "--agent-llm-id",
+        type=int,
+        default=None,
+        help="Optional override for BYOK NewLLMConfig rows.",
+    )
+    p_setup.add_argument(
+        "--scenario",
+        choices=SCENARIOS,
+        default=DEFAULT_SCENARIO,
+        help=(
+            "head-to-head (default): both arms answer with --provider-model; "
+            "symmetric-cheap: both arms use the same cheap text-only slug, "
+            "SurfSense pre-extracted images at ingest with a vision LLM; "
+            "cost-arbitrage: native arm uses --native-arm-model (vision), "
+            "SurfSense uses --provider-model (cheap, text-only) over chunks "
+            "the vision LLM already extracted at ingest."
+        ),
+    )
+    p_setup.add_argument(
+        "--vision-llm",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "OpenRouter slug for the vision LLM SurfSense uses at ingest "
+            "when --use-vision-llm is on. If omitted in symmetric-cheap / "
+            "cost-arbitrage, the strongest registered vision config is "
+            "auto-picked (priority: claude-sonnet-4.5 > claude-opus-4.7 > "
+            "gpt-5 > gemini-2.5-pro)."
+        ),
+    )
+    p_setup.add_argument(
+        "--native-arm-model",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "Required for --scenario cost-arbitrage. OpenRouter slug used "
+            "by the native_pdf arm only; SurfSense answers with "
+            "--provider-model. Ignored for head-to-head / symmetric-cheap."
+        ),
+    )
+    p_setup.add_argument(
+        "--no-vision-llm-setup",
+        action="store_true",
+        help=(
+            "Skip attaching a vision LLM config to the SearchSpace even if "
+            "the scenario would normally require one. Use when you want to "
+            "keep whatever is already attached (e.g. a per-user config)."
+        ),
+    )
+    p_setup.set_defaults(_func=_cmd_setup, _async=True)
+
+    p_teardown = sub.add_parser("teardown", help="Soft-delete the suite SearchSpace + clear state slot.")
+    p_teardown.add_argument("--suite", required=True)
+    p_teardown.set_defaults(_func=_cmd_teardown, _async=True)
+
+    p_models = sub.add_parser("models", help="LLM-config discovery helpers.")
+    models_sub = p_models.add_subparsers(dest="subcommand", required=True)
+    p_models_list = models_sub.add_parser("list", help="List global LLM configs.")
+    p_models_list.add_argument("--provider", default=None, help="Filter by provider, e.g. openrouter")
+    p_models_list.add_argument("--grep", default=None, help="Substring filter on name / model_name.")
+    p_models_list.set_defaults(_func=_cmd_models_list, _async=True)
+
+    p_suites = sub.add_parser("suites", help="List registered suites.")
+    suites_sub = p_suites.add_subparsers(dest="subcommand", required=True)
+    p_suites_list = suites_sub.add_parser("list", help="List suites.")
+    p_suites_list.set_defaults(_func=_cmd_suites_list, _async=False)
+
+    p_benchmarks = sub.add_parser("benchmarks", help="List registered benchmarks.")
+    bench_sub = p_benchmarks.add_subparsers(dest="subcommand", required=True)
+    p_bench_list = bench_sub.add_parser("list", help="List benchmarks.")
+    p_bench_list.add_argument("--suite", default=None)
+    p_bench_list.set_defaults(_func=_cmd_benchmarks_list, _async=False)
+
+    # Dynamic ingest / run subcommands need the registry populated, so
+    # discover up-front (cheap on import — modules just register).
+    _discover_suites()
+
+    p_ingest = sub.add_parser("ingest", help="Ingest a benchmark's corpus.")
+    ingest_sub = p_ingest.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = ingest_sub.add_parser(suite, help=f"Ingest a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_ingest, _async=True)
+
+    p_run = sub.add_parser("run", help="Run a benchmark.")
+    run_sub = p_run.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = run_sub.add_parser(suite, help=f"Run a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_run, _async=True)
+
+    p_report = sub.add_parser("report", help="Aggregate latest run artifacts into a summary.")
+    p_report.add_argument("--suite", required=True)
+    p_report.add_argument("--benchmark", default=None, help="Optional: report only this benchmark.")
+    p_report.set_defaults(_func=_cmd_report, _async=True)
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+    func = getattr(args, "_func", None)
+    if func is None:
+        parser.print_help()
+        return 2
+    is_async = getattr(args, "_async", False)
+    try:
+        if is_async:
+            return asyncio.run(func(args))
+        return func(args)
+    except KeyboardInterrupt:
+        console.print("[yellow]Interrupted.[/yellow]")
+        return 130
+    except Exception as exc:  # noqa: BLE001
+        logger.exception("CLI command failed")
+        console.print(f"[red]Command failed: {exc}[/red]")
+        return 1
+
+
+if __name__ == "__main__":  # pragma: no cover
+    sys.exit(main())
--- a/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
+++ b/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
@ -0,0 +1,14 @@
+"""HTTP clients for the SurfSense API. All share one ``httpx.AsyncClient``."""
+
+from __future__ import annotations
+
+from .documents import DocumentsClient
+from .new_chat import NewChatClient, StreamedAnswer
+from .search_space import SearchSpaceClient
+
+__all__ = [
+    "DocumentsClient",
+    "NewChatClient",
+    "SearchSpaceClient",
+    "StreamedAnswer",
+]
--- a/surfsense_evals/src/surfsense_evals/core/clients/documents.py
+++ b/surfsense_evals/src/surfsense_evals/core/clients/documents.py
@ -0,0 +1,277 @@
+"""Client for ``/api/v1/documents/{fileupload,status,{id}/chunks}``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/documents_routes.py:122-292`` (POST fileupload)
+* ``surfsense_backend/app/routes/documents_routes.py:806-871`` (GET status batch)
+* ``surfsense_backend/app/routes/documents_routes.py:1062-1128`` (GET {id}/chunks paginated)
+
+Document processing is asynchronous:
+* ``POST /documents/fileupload`` returns immediately with
+  ``document_ids`` in ``pending``;
+* a Celery worker moves each through ``processing → ready/failed``;
+* the harness polls ``GET /documents/status?document_ids=...`` until
+  every doc is ``ready`` (otherwise the retriever sees an empty corpus
+  and accuracy numbers are meaningless).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import mimetypes
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FileUploadResult:
+    """Mirrors the JSON returned by ``POST /documents/fileupload``."""
+
+    document_ids: list[int]
+    duplicate_document_ids: list[int]
+    total_files: int
+    pending_files: int
+    skipped_duplicates: int
+    message: str = ""
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> FileUploadResult:
+        """Build a result from the decoded response body.
+
+        A key that is missing *or* present with a JSON ``null`` value falls
+        back to the empty value for its field (``[]``, ``0`` or ``""``), so
+        a sparse payload never raises and ``message`` never becomes the
+        literal string ``"None"``. Id and counter values are coerced with
+        ``int``.
+        """
+
+        return cls(
+            document_ids=[int(x) for x in payload.get("document_ids") or []],
+            duplicate_document_ids=[
+                int(x) for x in payload.get("duplicate_document_ids") or []
+            ],
+            total_files=int(payload.get("total_files") or 0),
+            pending_files=int(payload.get("pending_files") or 0),
+            skipped_duplicates=int(payload.get("skipped_duplicates") or 0),
+            message=str(payload.get("message") or ""),
+        )
+
+
+@dataclass
+class DocumentStatus:
+    """Processing status of one document, as built by ``DocumentsClient.get_status``."""
+
+    document_id: int
+    title: str
+    document_type: str
+    # Processing state string; ``"ready"`` and ``"failed"`` are the two values
+    # the properties below test for.
+    state: str
+    # Optional explanation; included in ``DocumentProcessingFailed`` messages.
+    reason: str | None = None
+
+    @property
+    def is_ready(self) -> bool:
+        """``True`` when ``state`` is exactly ``"ready"``."""
+        return self.state == "ready"
+
+    @property
+    def is_failed(self) -> bool:
+        """``True`` when ``state`` is exactly ``"failed"``."""
+        return self.state == "failed"
+
+
+@dataclass
+class ChunkRow:
+    """One chunk of a document, as built by ``DocumentsClient.list_chunks``."""
+
+    id: int
+    document_id: int
+    content: str = ""
+    # The item dict this row was built from, kept unmodified.
+    raw: dict[str, Any] = field(default_factory=dict)
+
+
+class DocumentProcessingFailed(RuntimeError):
+    """Error for documents whose polled state is ``failed``.
+
+    The message lists each document's id, title and reason (``unknown``
+    when no reason is set); the statuses are kept on ``self.statuses``.
+    """
+
+    def __init__(self, statuses: Sequence[DocumentStatus]) -> None:
+        parts = []
+        for status in statuses:
+            why = status.reason or "unknown"
+            parts.append(f"id={status.document_id} ({status.title!r}): {why}")
+        summary = ", ".join(parts)
+        super().__init__(f"Document(s) failed to process: {summary}")
+        self.statuses = list(statuses)
+
+
+class DocumentProcessingTimeout(RuntimeError):
+    """Raised by ``wait_until_ready`` when ``timeout_s`` elapses before every document is ready."""
+
+
+class DocumentsClient:
+    """Document upload + status polling + chunk listing."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        """Keep the HTTP client and the API base URL.
+
+        Trailing slashes are stripped from ``base_url`` so endpoint paths
+        can be appended with a leading ``/``.
+        """
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # upload
+    # ------------------------------------------------------------------
+
+    async def upload(
+        self,
+        files: Iterable[Path],
+        *,
+        search_space_id: int,
+        should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
+    ) -> FileUploadResult:
+        """Upload files to ``/api/v1/documents/fileupload``.
+
+        ``files`` is materialised to a list up front so that a one-shot
+        iterable can be checked for emptiness before anything is opened;
+        an empty input returns a zeroed ``FileUploadResult`` without
+        touching the network. Caller is responsible for ensuring each
+        path exists and respects the per-file size cap (50 MB backend
+        default).
+
+        Every file opened here is closed again before returning, whether
+        or not the request succeeded. Errors from the POST (including a
+        non-success status via ``raise_for_status``) propagate.
+        """
+
+        materialised = [Path(p) for p in files]
+        if not materialised:
+            return FileUploadResult(
+                document_ids=[],
+                duplicate_document_ids=[],
+                total_files=0,
+                pending_files=0,
+                skipped_duplicates=0,
+                message="No files supplied",
+            )
+
+        opened: list[tuple[str, Any]] = []
+        try:
+            for path in materialised:
+                # ``open`` directly — httpx wraps it in MultipartStream.
+                file_obj = path.open("rb")
+                mime, _ = mimetypes.guess_type(path.name)
+                opened.append(
+                    (
+                        "files",
+                        (path.name, file_obj, mime or "application/octet-stream"),
+                    )
+                )
+
+            response = await self._http.post(
+                f"{self._base}/api/v1/documents/fileupload",
+                data={
+                    "search_space_id": str(search_space_id),
+                    "should_summarize": "true" if should_summarize else "false",
+                    "use_vision_llm": "true" if use_vision_llm else "false",
+                    "processing_mode": processing_mode,
+                },
+                files=opened,
+                # Multipart uploads can be slow for big PDFs; bump per-call.
+                timeout=httpx.Timeout(120.0, connect=10.0),
+            )
+        finally:
+            # Best-effort cleanup: a failing ``close`` must not mask the
+            # upload's own outcome, but it is logged rather than dropped.
+            for _, (name, file_obj, _) in opened:
+                try:
+                    file_obj.close()
+                except Exception:  # noqa: BLE001
+                    logger.debug("Failed to close upload file %r", name, exc_info=True)
+
+        response.raise_for_status()
+        return FileUploadResult.from_payload(response.json())
+
+    # ------------------------------------------------------------------
+    # status polling
+    # ------------------------------------------------------------------
+
+    async def get_status(
+        self, *, search_space_id: int, document_ids: Sequence[int]
+    ) -> list[DocumentStatus]:
+        """Fetch the status of ``document_ids`` from ``GET /documents/status``.
+
+        Ids are sent as one comma-separated query parameter. An empty
+        ``document_ids`` returns ``[]`` without issuing a request. An item
+        with no ``status`` object, or no ``state`` inside it, is given the
+        state ``"ready"``.
+        """
+
+        if not document_ids:
+            return []
+
+        query = {
+            "search_space_id": search_space_id,
+            "document_ids": ",".join(str(d) for d in document_ids),
+        }
+        response = await self._http.get(
+            f"{self._base}/api/v1/documents/status",
+            params=query,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+
+        results: list[DocumentStatus] = []
+        for item in response.json().get("items", []):
+            doc_id = int(item["id"])
+            status_blob = item.get("status") or {}
+            results.append(
+                DocumentStatus(
+                    document_id=doc_id,
+                    title=str(item.get("title", "")),
+                    document_type=str(item.get("document_type", "")),
+                    state=str(status_blob.get("state", "ready")),
+                    reason=status_blob.get("reason"),
+                )
+            )
+        return results
+
+    async def wait_until_ready(
+        self,
+        *,
+        search_space_id: int,
+        document_ids: Sequence[int],
+        timeout_s: float = 300.0,
+        initial_poll_s: float = 1.0,
+        max_poll_s: float = 10.0,
+    ) -> list[DocumentStatus]:
+        """Poll ``GET /documents/status`` until every doc is ``ready``.
+
+        The poll interval starts at ``initial_poll_s`` and grows by a
+        factor of 1.5 per round up to ``max_poll_s``. Raises
+        ``DocumentProcessingFailed`` if any doc lands in ``failed`` (with
+        the offending statuses), or ``DocumentProcessingTimeout`` if the
+        budget is exhausted.
+
+        Readiness is decided per requested id, not by comparing counts, so
+        duplicate ids in ``document_ids`` do not prevent completion, and
+        ids the backend has not reported yet are listed as outstanding in
+        the timeout message. Returns the statuses from the final poll, or
+        ``[]`` when ``document_ids`` is empty.
+        """
+
+        if not document_ids:
+            return []
+        # De-duplicate while keeping the caller's order.
+        wanted = list(dict.fromkeys(document_ids))
+        # We are inside a coroutine, so a running loop always exists;
+        # ``get_event_loop`` is deprecated for this use.
+        loop = asyncio.get_running_loop()
+        deadline = loop.time() + timeout_s
+        poll = initial_poll_s
+        while True:
+            statuses = await self.get_status(
+                search_space_id=search_space_id, document_ids=wanted
+            )
+            failed = [s for s in statuses if s.is_failed]
+            if failed:
+                raise DocumentProcessingFailed(failed)
+            ready_ids = {s.document_id for s in statuses if s.is_ready}
+            if ready_ids.issuperset(wanted):
+                return statuses
+            now = loop.time()
+            if now >= deadline:
+                pending_ids = [d for d in wanted if d not in ready_ids]
+                raise DocumentProcessingTimeout(
+                    f"Timed out after {timeout_s:.0f}s waiting for documents "
+                    f"(still pending/processing: {pending_ids})"
+                )
+            # Never sleep past the deadline, but keep a small floor so the
+            # final rounds do not spin.
+            await asyncio.sleep(min(poll, max(0.1, deadline - now)))
+            poll = min(poll * 1.5, max_poll_s)
+
+    # ------------------------------------------------------------------
+    # chunks (chunk_id -> document_id map)
+    # ------------------------------------------------------------------
+
+    async def list_chunks(
+        self, document_id: int, *, page_size: int = 100
+    ) -> list[ChunkRow]:
+        """Collect every chunk of ``document_id`` from ``GET /documents/{id}/chunks``.
+
+        Pages are requested starting at ``page=0`` and the walk stops at
+        the first response whose ``has_more`` is falsy. Used by ingestion
+        to materialise the ``chunk_id -> document_id`` map needed for
+        retrieval scoring (CUREv1).
+        """
+
+        endpoint = f"{self._base}/api/v1/documents/{document_id}/chunks"
+        collected: list[ChunkRow] = []
+        page_no = 0
+        more = True
+        while more:
+            response = await self._http.get(
+                endpoint,
+                params={"page": page_no, "page_size": page_size},
+                headers={"Accept": "application/json"},
+            )
+            response.raise_for_status()
+            body = response.json()
+            collected.extend(
+                ChunkRow(
+                    id=int(item["id"]),
+                    document_id=document_id,
+                    content=str(item.get("content", "")),
+                    raw=item,
+                )
+                for item in body.get("items", [])
+            )
+            more = bool(body.get("has_more"))
+            page_no += 1
+        return collected
--- a/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
+++ b/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
@ -0,0 +1,280 @@
+"""Client for ``/api/v1/threads`` and ``/api/v1/new_chat`` (SSE).
+
+Verified against:
+
+* ``surfsense_backend/app/routes/new_chat_routes.py:793-848`` (POST /threads)
+* ``surfsense_backend/app/routes/new_chat_routes.py:1073-1142`` (DELETE /threads/{id})
+* ``surfsense_backend/app/routes/new_chat_routes.py:1689-1800`` (POST /new_chat SSE)
+* ``surfsense_backend/app/routes/new_chat_routes.py:191-220`` (THREAD_BUSY / TURN_CANCELLING 409)
+* ``surfsense_backend/app/services/streaming/envelope/sse.py`` (wire framing)
+* ``surfsense_backend/app/services/streaming/events/text.py`` (text-delta events)
+* ``surfsense_backend/app/schemas/new_chat.py:234-288`` (NewChatRequest body)
+
+The wire format is "Vercel AI SDK"-flavoured SSE with one event per
+``data: <json>\n\n`` block (or the literal ``data: [DONE]\n\n``
+terminator). Text deltas arrive as ``{"type":"text-delta","id":...,"delta":...}``
+events; we accumulate them per ``id`` and emit the final concatenated
+text plus parsed citations.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from collections.abc import AsyncIterator, Sequence
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from ..parse import iter_sse_events, parse_citations
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StreamedAnswer:
+    """Result of a single ``/new_chat`` turn."""
+
+    # Concatenated text-delta payloads of the turn.
+    text: str
+    # Every SSE payload that decoded to a JSON object, in arrival order.
+    raw_events: list[dict[str, Any]] = field(default_factory=list)
+    # Wall-clock duration of the request; filled in by ``_stream_once``.
+    latency_ms: int = 0
+    user_message_id: str | None = None
+    assistant_message_id: str | None = None
+    # True when a ``finish`` event or the ``[DONE]`` sentinel was seen.
+    finished_normally: bool = False
+
+    @property
+    def citations(self) -> list[dict[str, Any]]:
+        """Parsed citation tokens (lazy; small enough to recompute)."""
+
+        return [token.to_dict() for token in parse_citations(self.text)]
+
+
+class ThreadBusyError(RuntimeError):
+    """Signals a 409 ``THREAD_BUSY`` / ``TURN_CANCELLING`` response.
+
+    The backend error code is kept on ``error_code``; the exception text
+    is ``"<error_code>: <message>"``.
+    """
+
+    def __init__(self, error_code: str, message: str) -> None:
+        text = "{}: {}".format(error_code, message)
+        super().__init__(text)
+        self.error_code = error_code
+
+
+class NewChatClient:
+    """Thread create / delete / SSE ask."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        """Keep the HTTP client and the API base URL (trailing ``/`` stripped)."""
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # threads
+    # ------------------------------------------------------------------
+
+    async def create_thread(
+        self,
+        *,
+        search_space_id: int,
+        title: str = "eval",
+        archived: bool = False,
+        visibility: str = "PRIVATE",
+    ) -> int:
+        """Create a thread via ``POST /api/v1/threads`` and return its integer id.
+
+        Error statuses propagate through ``raise_for_status``.
+        """
+
+        thread_spec = {
+            "search_space_id": search_space_id,
+            "title": title,
+            "archived": archived,
+            "visibility": visibility,
+        }
+        response = await self._http.post(
+            f"{self._base}/api/v1/threads",
+            json=thread_spec,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return int(response.json()["id"])
+
+    async def delete_thread(self, thread_id: int) -> None:
+        """Delete a thread via ``DELETE /api/v1/threads/{id}``.
+
+        A 404 is treated as success so the call is idempotent; any other
+        error status propagates through ``raise_for_status``.
+        """
+
+        endpoint = f"{self._base}/api/v1/threads/{thread_id}"
+        response = await self._http.delete(
+            endpoint, headers={"Accept": "application/json"}
+        )
+        if response.status_code != 404:
+            response.raise_for_status()
+
+    # ------------------------------------------------------------------
+    # /new_chat SSE
+    # ------------------------------------------------------------------
+
+    async def ask(
+        self,
+        *,
+        thread_id: int,
+        search_space_id: int,
+        user_query: str,
+        mentioned_document_ids: Sequence[int] | None = None,
+        disabled_tools: Sequence[str] | None = None,
+        max_busy_retries: int = 4,
+        timeout_s: float = 600.0,
+    ) -> StreamedAnswer:
+        """Stream a single turn and return the accumulated answer.
+
+        When ``_stream_once`` raises ``ThreadBusyError`` (backend 409
+        ``THREAD_BUSY`` / ``TURN_CANCELLING``), sleep and replay the same
+        request. The wait is computed locally as ``0.5 * 2 ** attempt``
+        seconds, capped at 30s; no ``Retry-After`` style response header
+        is read here. Bounded by ``max_busy_retries`` so a stuck thread
+        never blocks the whole run: once that many retries have been
+        used, the ``ThreadBusyError`` propagates to the caller.
+
+        ``mentioned_document_ids`` and ``disabled_tools`` are only added
+        to the request body when non-empty.
+        """
+
+        body: dict[str, Any] = {
+            "chat_id": thread_id,
+            "search_space_id": search_space_id,
+            "user_query": user_query,
+        }
+        if mentioned_document_ids:
+            body["mentioned_document_ids"] = list(mentioned_document_ids)
+        if disabled_tools:
+            body["disabled_tools"] = list(disabled_tools)
+
+        attempt = 0
+        while True:
+            try:
+                return await self._stream_once(body=body, timeout_s=timeout_s)
+            except ThreadBusyError as exc:
+                attempt += 1
+                if attempt > max_busy_retries:
+                    raise
+                # Local exponential backoff (1s, 2s, 4s, ...), capped at 30s.
+                wait = min(30.0, 0.5 * (2 ** attempt))
+                logger.info(
+                    "thread_id=%s busy (%s); retry %d/%d after %.1fs",
+                    thread_id,
+                    exc.error_code,
+                    attempt,
+                    max_busy_retries,
+                    wait,
+                )
+                await asyncio.sleep(wait)
+
+    async def _stream_once(
+        self,
+        *,
+        body: dict[str, Any],
+        timeout_s: float,
+    ) -> StreamedAnswer:
+        """Issue one ``POST /api/v1/new_chat`` and consume its SSE response.
+
+        Raises ``ThreadBusyError`` on a 409 (error code and message taken
+        from the response body, with ``THREAD_BUSY`` / ``Thread is busy``
+        as fallbacks); other error statuses propagate through
+        ``raise_for_status``. ``latency_ms`` on the returned answer is
+        measured from just before the request until the stream is closed.
+        """
+        # Per-call timeout — the connect should be quick, the read needs
+        # to outlive the longest LLM completion.
+        timeout = httpx.Timeout(timeout_s, connect=10.0)
+        started = time.monotonic()
+        async with self._http.stream(
+            "POST",
+            f"{self._base}/api/v1/new_chat",
+            json=body,
+            headers={"Accept": "text/event-stream"},
+            timeout=timeout,
+        ) as response:
+            # Check for 409 before ``raise_for_status`` so busy threads
+            # surface as the retryable ``ThreadBusyError``.
+            if response.status_code == 409:
+                detail = await self._extract_busy_detail(response)
+                raise ThreadBusyError(
+                    error_code=detail.get("errorCode", "THREAD_BUSY"),
+                    message=detail.get("message", "Thread is busy"),
+                )
+            response.raise_for_status()
+            answer = await self._consume_sse(response)
+        answer.latency_ms = int((time.monotonic() - started) * 1000)
+        return answer
+
+    @staticmethod
+    async def _extract_busy_detail(response: httpx.Response) -> dict[str, Any]:
+        """Read a 409 response body and return its error detail as a dict.
+
+        * Body is not valid JSON: a synthetic ``THREAD_BUSY`` detail whose
+          message is the raw response text.
+        * JSON object with a dict under ``detail``: that inner dict.
+        * Any other JSON object: the object itself.
+        * JSON that is not an object: ``{}``.
+        """
+        try:
+            payload = json.loads(await response.aread())
+        except (json.JSONDecodeError, ValueError):
+            return {"errorCode": "THREAD_BUSY", "message": response.text}
+        if isinstance(payload, dict) and isinstance(payload.get("detail"), dict):
+            return payload["detail"]
+        return payload if isinstance(payload, dict) else {}
+
+    @staticmethod
+    async def _consume_sse(response: httpx.Response) -> StreamedAnswer:
+        """Walk SSE events, accumulate text-delta payloads.
+
+        Backend events of interest:
+
+        * ``{"type": "text-start", "id": ...}``
+        * ``{"type": "text-delta", "id": ..., "delta": ...}``
+        * ``{"type": "text-end", "id": ...}``
+        * ``{"type": "start", "messageId": ...}``  (top-level message id)
+        * ``{"type": "finish"}``
+        * literal ``[DONE]`` sentinel
+
+        Multiple ``text-start`` blocks can interleave — each gets its
+        own ``id``. Deltas are buffered per ``id`` and the buffers are
+        concatenated in the order each ``id`` was first seen, so
+        interleaved deltas end up grouped by block rather than in strict
+        arrival order.
+
+        The returned answer has ``latency_ms`` left at its default; the
+        caller fills it in. ``finished_normally`` is set when a ``finish``
+        event or the ``[DONE]`` sentinel was seen.
+        """
+
+        ordered_text_ids: list[str] = []
+        text_buffers: dict[str, list[str]] = {}
+        raw_events: list[dict[str, Any]] = []
+        user_message_id: str | None = None
+        assistant_message_id: str | None = None
+        finished = False
+
+        async for event in iter_sse_events(_aiter_lines(response)):
+            data = event.data
+            if data == "[DONE]":
+                finished = True
+                # Keep iterating rather than breaking: any later events
+                # are still processed.
+                continue
+            try:
+                payload = json.loads(data)
+            except (json.JSONDecodeError, ValueError):
+                logger.debug("Skipping non-JSON SSE payload: %r", data[:120])
+                continue
+            if not isinstance(payload, dict):
+                continue
+            raw_events.append(payload)
+            ev_type = payload.get("type")
+            if ev_type == "text-delta":
+                # A delta without an ``id`` is buffered under the empty key.
+                tid = str(payload.get("id", ""))
+                delta = payload.get("delta", "")
+                if not isinstance(delta, str):
+                    continue
+                if tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+                text_buffers[tid].append(delta)
+            elif ev_type == "text-start":
+                # Registers the block's position even before its first delta.
+                tid = str(payload.get("id", ""))
+                if tid and tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+            elif ev_type == "start":
+                msg_id = payload.get("messageId")
+                if isinstance(msg_id, str):
+                    # Only a fallback: never overrides an id already set.
+                    user_message_id = user_message_id or msg_id
+            elif ev_type == "data-user-message-id":
+                # The id may sit under ``data.id`` or at the top level.
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    user_message_id = msg_id
+            elif ev_type == "data-assistant-message-id":
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    assistant_message_id = msg_id
+            elif ev_type == "finish":
+                finished = True
+
+        text = "".join("".join(text_buffers.get(tid, [])) for tid in ordered_text_ids)
+        return StreamedAnswer(
+            text=text,
+            raw_events=raw_events,
+            user_message_id=user_message_id,
+            assistant_message_id=assistant_message_id,
+            finished_normally=finished,
+        )
+
+
+async def _aiter_lines(response: httpx.Response) -> AsyncIterator[str]:
+    """Adapter so the parser can consume any line iterator (mockable in tests).
+
+    Yields each line from ``response.aiter_lines()`` unchanged.
+    """
+
+    async for line in response.aiter_lines():
+        yield line
--- a/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
+++ b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
@ -0,0 +1,207 @@
+"""Client for ``/api/v1/searchspaces`` and ``/api/v1/search-spaces/{id}/llm-preferences``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/search_spaces_routes.py:116`` (POST create)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:234`` (GET by id)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:422`` (DELETE soft-delete)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:698-849`` (GET/PUT llm-preferences)
+* ``surfsense_backend/app/schemas/search_space.py:14`` (SearchSpaceCreate body)
+* ``surfsense_backend/app/routes/vision_llm_routes.py:60`` (GET global vision configs)
+
+Note the inconsistent hyphenation in the backend: ``/searchspaces``
+(no hyphen) for CRUD, but ``/search-spaces`` (hyphenated) for the
+``llm-preferences`` sub-resource. Both are mirrored verbatim here.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+
+@dataclass
+class SearchSpaceRow:
+    """The SearchSpace fields the harness uses (a subset of the full row)."""
+
+    id: int
+    name: str
+    description: str | None
+    user_id: str
+    citations_enabled: bool
+    qna_custom_instructions: str | None
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> SearchSpaceRow:
+        """Build a row from a decoded response body.
+
+        ``id`` and ``name`` are required (``KeyError`` when absent);
+        ``user_id`` defaults to ``""`` and ``citations_enabled`` to
+        ``True``; the two optional text fields default to ``None``.
+        """
+
+        row_id = int(payload["id"])
+        row_name = str(payload["name"])
+        owner = str(payload.get("user_id", ""))
+        citations = bool(payload.get("citations_enabled", True))
+        return cls(
+            id=row_id,
+            name=row_name,
+            description=payload.get("description"),
+            user_id=owner,
+            citations_enabled=citations,
+            qna_custom_instructions=payload.get("qna_custom_instructions"),
+        )
+
+
+@dataclass
+class VisionLlmConfigEntry:
+    """The fields used from one ``GET /global-vision-llm-configs`` row.
+
+    The backend returns negative ids for global / OpenRouter-derived
+    vision configs and positive ids for per-user BYOK rows. Either is
+    accepted by ``set_llm_preferences(vision_llm_config_id=...)``.
+    """
+
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    is_auto_mode: bool
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> VisionLlmConfigEntry:
+        """Build an entry from one decoded row.
+
+        Every field is optional; ``provider`` is upper-cased and the
+        original dict is kept on ``raw``.
+        """
+
+        provider_name = str(payload.get("provider", ""))
+        auto_flag = payload.get("is_auto_mode", False)
+        return cls(
+            id=int(payload.get("id", 0)),
+            name=str(payload.get("name", "")),
+            provider=provider_name.upper(),
+            model_name=str(payload.get("model_name", "")),
+            is_auto_mode=bool(auto_flag),
+            raw=payload,
+        )
+
+
+@dataclass
+class LlmPreferences:
+    """LLM preferences of a search space, including the embedded agent config row.
+
+    Mirrors ``LLMPreferencesRead`` from the backend so the lifecycle
+    command can introspect ``provider`` / ``model_name`` to validate the
+    OpenRouter pin.
+    """
+
+    agent_llm_id: int | None
+    document_summary_llm_id: int | None
+    image_generation_config_id: int | None
+    vision_llm_config_id: int | None
+    agent_llm: dict[str, Any] | None
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
+        """Copy the known keys out of ``payload`` (``None`` when absent) and keep it on ``raw``."""
+
+        id_fields = (
+            "agent_llm_id",
+            "document_summary_llm_id",
+            "image_generation_config_id",
+            "vision_llm_config_id",
+        )
+        ids = {key: payload.get(key) for key in id_fields}
+        return cls(agent_llm=payload.get("agent_llm"), raw=payload, **ids)
+
+
+class SearchSpaceClient:
+    """Thin wrapper around the SearchSpace + LLM preferences endpoints."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        """Keep the HTTP client and the API base URL (trailing ``/`` stripped)."""
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    async def create(self, name: str, *, description: str | None = None) -> SearchSpaceRow:
+        """Create a search space via ``POST /api/v1/searchspaces``.
+
+        ``description`` is only sent when it is not ``None``. Error
+        statuses propagate through ``raise_for_status``.
+        """
+
+        # citations_enabled defaults to True backend-side; keep that default.
+        body: dict[str, Any]
+        if description is None:
+            body = {"name": name}
+        else:
+            body = {"name": name, "description": description}
+        endpoint = f"{self._base}/api/v1/searchspaces"
+        response = await self._http.post(
+            endpoint, json=body, headers={"Accept": "application/json"}
+        )
+        response.raise_for_status()
+        return SearchSpaceRow.from_payload(response.json())
+
+    async def get(self, search_space_id: int) -> SearchSpaceRow:
+        """Fetch one search space via ``GET /api/v1/searchspaces/{id}``."""
+
+        endpoint = f"{self._base}/api/v1/searchspaces/{search_space_id}"
+        response = await self._http.get(
+            endpoint, headers={"Accept": "application/json"}
+        )
+        response.raise_for_status()
+        row_data = response.json()
+        return SearchSpaceRow.from_payload(row_data)
+
+    async def delete(self, search_space_id: int) -> None:
+        """Soft-delete: backend prefixes name with ``[DELETING]`` and dispatches a Celery cascade.
+
+        A 404 is accepted silently; other error statuses propagate
+        through ``raise_for_status``.
+        """
+
+        endpoint = f"{self._base}/api/v1/searchspaces/{search_space_id}"
+        response = await self._http.delete(
+            endpoint, headers={"Accept": "application/json"}
+        )
+        # 404 means it's already gone — treat as success (idempotent teardown).
+        if response.status_code != 404:
+            response.raise_for_status()
+
+    async def get_llm_preferences(self, search_space_id: int) -> LlmPreferences:
+        """Fetch preferences via ``GET /api/v1/search-spaces/{id}/llm-preferences``."""
+
+        endpoint = (
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences"
+        )
+        response = await self._http.get(
+            endpoint, headers={"Accept": "application/json"}
+        )
+        response.raise_for_status()
+        prefs_data = response.json()
+        return LlmPreferences.from_payload(prefs_data)
+
+    async def set_llm_preferences(
+        self,
+        search_space_id: int,
+        *,
+        agent_llm_id: int | None = None,
+        document_summary_llm_id: int | None = None,
+        image_generation_config_id: int | None = None,
+        vision_llm_config_id: int | None = None,
+    ) -> LlmPreferences:
+        """PUT a partial update to ``/search-spaces/{id}/llm-preferences``.
+
+        Only arguments that are not ``None`` are sent. Backend uses
+        ``model_dump(exclude_unset=True)`` so omitted fields are left
+        unchanged. Returns the preferences parsed from the response.
+        """
+
+        candidates = {
+            "agent_llm_id": agent_llm_id,
+            "document_summary_llm_id": document_summary_llm_id,
+            "image_generation_config_id": image_generation_config_id,
+            "vision_llm_config_id": vision_llm_config_id,
+        }
+        body: dict[str, Any] = {
+            key: value for key, value in candidates.items() if value is not None
+        }
+        endpoint = (
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences"
+        )
+        response = await self._http.put(
+            endpoint, json=body, headers={"Accept": "application/json"}
+        )
+        response.raise_for_status()
+        return LlmPreferences.from_payload(response.json())
+
+    async def list_global_vision_llm_configs(self) -> list[VisionLlmConfigEntry]:
+        """Return the registered global vision LLM configs.
+
+        Used by ``setup`` to (a) resolve an explicit ``--vision-llm <slug>``
+        to a config id and (b) auto-pick the strongest registered vision
+        config when the operator doesn't pass one. Rows flagged
+        ``is_auto_mode`` (the ``Auto (Fastest)`` entry, ``id=0``) are
+        filtered out — accuracy must be reproducible.
+
+        Raises ``RuntimeError`` when the response body is not a JSON list.
+        """
+
+        response = await self._http.get(
+            f"{self._base}/api/v1/global-vision-llm-configs",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        rows = response.json()
+        if not isinstance(rows, list):
+            raise RuntimeError(
+                f"Unexpected /global-vision-llm-configs payload: {rows!r}"
+            )
+
+        entries: list[VisionLlmConfigEntry] = []
+        for row in rows:
+            if bool(row.get("is_auto_mode", False)):
+                continue
+            entries.append(VisionLlmConfigEntry.from_payload(row))
+        return entries
--- a/surfsense_evals/src/surfsense_evals/core/config.py
+++ b/surfsense_evals/src/surfsense_evals/core/config.py
@ -0,0 +1,279 @@
+"""Environment + filesystem configuration for the harness.
+
+Two responsibilities:
+
+1. Load env vars (with sensible defaults) into a single immutable ``Config``
+   so that every other module reads it from one place.
+2. Read / write ``data/state.json``. State is keyed by suite name so multiple
+   suites can be set up in parallel and torn down independently.
+
+The pinned ``search_space_id`` lives in ``state.json`` (not env) so re-runs
+are idempotent without forcing the operator to remember an integer.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
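+# Resolve once at import time. ``load_dotenv()`` locates the file via
+# ``find_dotenv``, which by default walks up from this module's directory
+# (not from CWD), so a ``.env`` at the project root is picked up. Silent
+# no-op if no ``.env`` is found.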
+load_dotenv()
+
+
+_PROJECT_ROOT = Path(__file__).resolve().parents[3]
+"""Resolves to ``surfsense_evals/`` (the package root, not ``src/``)."""
+
+
+def _project_root() -> Path:
+    """Locate the ``surfsense_evals/`` project root directory.
+
+    Hands back the module-level ``_PROJECT_ROOT``, which is derived from
+    this file's own location (``src/surfsense_evals/core/config.py``, four
+    levels up). Exposed as a function so tests can monkeypatch it.
+    """
+
+    root = _PROJECT_ROOT
+    return root
+
+
+@dataclass(frozen=True)
+class Config:
+    """Frozen snapshot of the harness's runtime settings."""
+
+    # Service endpoints and OpenRouter access.
+    surfsense_api_base: str
+    openrouter_api_key: str | None
+    openrouter_base_url: str
+
+    # Credentials — exactly ONE mode must be supplied.
+    surfsense_jwt: str | None
+    surfsense_refresh_token: str | None
+    surfsense_user_email: str | None
+    surfsense_user_password: str | None
+
+    # Filesystem paths.
+    data_dir: Path
+    reports_dir: Path
+
+    @property
+    def state_path(self) -> Path:
+        """Location of ``state.json`` inside ``data_dir``."""
+
+        return self.data_dir.joinpath("state.json")
+
+    def has_jwt_mode(self) -> bool:
+        """``True`` when a non-empty JWT is configured."""
+
+        return True if self.surfsense_jwt else False
+
+    def has_local_mode(self) -> bool:
+        """``True`` when both the user email and password are non-empty."""
+
+        if not self.surfsense_user_email:
+            return False
+        return bool(self.surfsense_user_password)
+
+    def credential_mode(self) -> str:
+        """Return ``"jwt"``, ``"local"``, or ``"none"`` (no credentials supplied)."""
+
+        # JWT is checked first, so it wins when both modes are configured.
+        for mode, is_available in (
+            ("jwt", self.has_jwt_mode),
+            ("local", self.has_local_mode),
+        ):
+            if is_available():
+                return mode
+        return "none"
+
+    def suite_data_dir(self, suite: str) -> Path:
+        """Per-suite directory under ``data_dir``."""
+
+        return self.data_dir.joinpath(suite)
+
+    def suite_reports_dir(self, suite: str) -> Path:
+        """Per-suite directory under ``reports_dir``."""
+
+        return self.reports_dir.joinpath(suite)
+
+    def suite_runs_dir(self, suite: str) -> Path:
+        """``runs`` subdirectory of the suite's data directory."""
+
+        return self.suite_data_dir(suite).joinpath("runs")
+
+    def suite_maps_dir(self, suite: str) -> Path:
+        """``maps`` subdirectory of the suite's data directory."""
+
+        return self.suite_data_dir(suite).joinpath("maps")
+
+
+def load_config() -> Config:
+    """Read the current process env into a ``Config``.
+
+    No validation is performed here; callers (e.g. ``auth.acquire_token``,
+    ``cli`` subcommands) decide which fields they require. This keeps
+    ``models list`` and ``suites list`` runnable without OpenRouter creds.
+
+    Every variable treats an empty string the same as "unset": optional
+    values become ``None`` and values with a default fall back to that
+    default. Trailing slashes are stripped from both base URLs, and the
+    data / reports directories are resolved to absolute paths.
+    """
+
+    env = os.environ
+    project_root = _project_root()
+    data_dir = Path(env.get("EVAL_DATA_DIR") or (project_root / "data")).resolve()
+    reports_dir = Path(env.get("EVAL_REPORTS_DIR") or (project_root / "reports")).resolve()
+    # ``or`` rather than a ``.get`` default: a blank ``SURFSENSE_API_BASE=``
+    # line (e.g. from a ``.env`` template) must not produce an empty base
+    # URL; it falls back to the default like the other variables do.
+    api_base = env.get("SURFSENSE_API_BASE") or "http://localhost:8000"
+    openrouter_base = env.get("OPENROUTER_BASE_URL") or "https://openrouter.ai/api/v1"
+    return Config(
+        surfsense_api_base=api_base.rstrip("/"),
+        openrouter_api_key=env.get("OPENROUTER_API_KEY") or None,
+        openrouter_base_url=openrouter_base.rstrip("/"),
+        surfsense_jwt=env.get("SURFSENSE_JWT") or None,
+        surfsense_refresh_token=env.get("SURFSENSE_REFRESH_TOKEN") or None,
+        surfsense_user_email=env.get("SURFSENSE_USER_EMAIL") or None,
+        surfsense_user_password=env.get("SURFSENSE_USER_PASSWORD") or None,
+        data_dir=data_dir,
+        reports_dir=reports_dir,
+    )
+
+
+# ---------------------------------------------------------------------------
+# state.json — per-suite slots
+# ---------------------------------------------------------------------------
+
+
+# Scenario names — chosen at ``setup`` time, persisted in ``state.json``.
+#
+# * ``head-to-head`` (default, current behaviour): both arms answer with the
+#   SAME slug pinned via ``--provider-model``. Vision LLM at ingest is
+#   optional but recommended for image-bearing benchmarks.
+# * ``symmetric-cheap``: both arms answer with the SAME (cheap, text-only)
+#   slug; SurfSense pre-extracted images at ingest with a vision LLM.
+#   Measures whether vision-RAG ingestion lets a cheap downstream model
+#   match a vision one. Native arm structurally loses on image questions —
+#   that's the point, and the report labels it accordingly.
+# * ``cost-arbitrage``: native arm answers with an EXPENSIVE vision slug
+#   (``--native-arm-model``), SurfSense answers with a CHEAP text-only slug
+#   (``--provider-model``) over chunks the vision LLM already extracted at
+#   ingest. Measures how close SurfSense gets to native at a fraction of
+#   the per-query cost. The most compelling "shines" framing.
+SCENARIOS: tuple[str, ...] = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+DEFAULT_SCENARIO: str = "head-to-head"
+
+
+@dataclass
+class SuiteState:
+    """State persisted for a single suite.
+
+    ``provider_model`` is the slug pinned to the SearchSpace's
+    ``agent_llm`` — it answers SurfSense queries, and the native arm uses
+    it as well unless ``native_arm_model`` is set (cost-arbitrage).
+
+    ``vision_provider_model`` is the slug of the OpenRouter vision LLM
+    config attached via ``vision_llm_config_id`` — what SurfSense uses to
+    extract image content at ingest time when ``use_vision_llm=True``.
+    ``None`` means no vision config was attached at setup (legacy or
+    text-only suite).
+    """
+
+    search_space_id: int
+    agent_llm_id: int
+    provider_model: str
+    created_at: str
+    ingestion_maps: dict[str, str] = field(default_factory=dict)
+    scenario: str = DEFAULT_SCENARIO
+    vision_llm_config_id: int | None = None
+    vision_provider_model: str | None = None
+    native_arm_model: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return every field as a plain dict (``ingestion_maps`` is copied)."""
+
+        return dict(
+            search_space_id=self.search_space_id,
+            agent_llm_id=self.agent_llm_id,
+            provider_model=self.provider_model,
+            created_at=self.created_at,
+            ingestion_maps=dict(self.ingestion_maps),
+            scenario=self.scenario,
+            vision_llm_config_id=self.vision_llm_config_id,
+            vision_provider_model=self.vision_provider_model,
+            native_arm_model=self.native_arm_model,
+        )
+
+    @classmethod
+    def from_dict(cls, payload: Mapping[str, Any]) -> SuiteState:
+        """Rebuild a ``SuiteState`` from its ``to_dict`` form.
+
+        ``search_space_id``, ``agent_llm_id`` and ``provider_model`` are
+        required keys. The scenario / vision / native fields fall back to
+        their defaults for back-compat with ``state.json`` written before
+        scenarios shipped; a scenario not listed in ``SCENARIOS`` is
+        replaced by ``DEFAULT_SCENARIO``.
+        """
+
+        def optional_str(key: str) -> str | None:
+            # Missing, ``None`` and empty values all collapse to ``None``.
+            value = payload.get(key)
+            return str(value) if value else None
+
+        scenario_raw = payload.get("scenario")
+        scenario = str(scenario_raw) if scenario_raw else DEFAULT_SCENARIO
+        if scenario not in SCENARIOS:
+            scenario = DEFAULT_SCENARIO
+        vision_id = payload.get("vision_llm_config_id")
+        return cls(
+            search_space_id=int(payload["search_space_id"]),
+            agent_llm_id=int(payload["agent_llm_id"]),
+            provider_model=str(payload["provider_model"]),
+            created_at=str(payload.get("created_at") or ""),
+            ingestion_maps=dict(payload.get("ingestion_maps") or {}),
+            scenario=scenario,
+            vision_llm_config_id=None if vision_id is None else int(vision_id),
+            vision_provider_model=optional_str("vision_provider_model"),
+            native_arm_model=optional_str("native_arm_model"),
+        )
+
+    @property
+    def effective_native_arm_model(self) -> str:
+        """Slug the native arm should use; falls back to ``provider_model``."""
+
+        if self.native_arm_model:
+            return self.native_arm_model
+        return self.provider_model
+
+
+def _load_state(config: Config) -> dict[str, Any]:
+    """Load ``state.json``, or an empty ``{"suites": {}}`` skeleton.
+
+    The skeleton is returned when the file does not exist, or when its
+    top level is not a dict containing a ``"suites"`` key. A file that
+    cannot be read or parsed raises ``RuntimeError``.
+    """
+
+    path = config.state_path
+    if not path.exists():
+        return {"suites": {}}
+    try:
+        with path.open("r", encoding="utf-8") as handle:
+            loaded = json.load(handle)
+    except (OSError, json.JSONDecodeError) as exc:
+        raise RuntimeError(
+            f"Failed to read state file {config.state_path}: {exc!s}. "
+            "Delete it if you want to start fresh."
+        ) from exc
+    if isinstance(loaded, dict) and "suites" in loaded:
+        return loaded
+    return {"suites": {}}
+
+
+def _write_state(config: Config, payload: Mapping[str, Any]) -> None:
+    """Serialise ``payload`` to ``state.json`` through a temp-file swap.
+
+    Creates ``data_dir`` if needed, writes indented, key-sorted JSON plus
+    a trailing newline to ``state.json.tmp``, then renames that file over
+    the real state path.
+    """
+
+    target = config.state_path
+    config.data_dir.mkdir(parents=True, exist_ok=True)
+    scratch = target.with_suffix(".json.tmp")
+    with scratch.open("w", encoding="utf-8") as handle:
+        json.dump(dict(payload), handle, indent=2, sort_keys=True)
+        handle.write("\n")
+    scratch.replace(target)
+
+
+def get_suite_state(config: Config, suite: str) -> SuiteState | None:
+    """Look up the persisted ``SuiteState`` for ``suite``.
+
+    Returns ``None`` when the suite has no slot (or an empty one) in the
+    state file, i.e. it has not been set up.
+    """
+
+    slots = _load_state(config).get("suites") or {}
+    slot = slots.get(suite)
+    return SuiteState.from_dict(slot) if slot else None
+
+
+def set_suite_state(config: Config, suite: str, suite_state: SuiteState) -> None:
+    """Store ``suite_state`` in the slot for ``suite`` and rewrite the file.
+
+    Slots belonging to other suites are carried over unchanged.
+    """
+
+    current = _load_state(config)
+    slots = dict(current.get("suites") or {})
+    slots[suite] = suite_state.to_dict()
+    current["suites"] = slots
+    _write_state(config, current)
+
+
+def clear_suite_state(config: Config, suite: str) -> bool:
+    """Drop the slot for ``suite`` from the state file.
+
+    Returns ``True`` when a slot existed and was removed; returns
+    ``False`` (without rewriting the file) when there was nothing to
+    remove.
+    """
+
+    current = _load_state(config)
+    remaining = dict(current.get("suites") or {})
+    if suite in remaining:
+        remaining.pop(suite)
+        current["suites"] = remaining
+        _write_state(config, current)
+        return True
+    return False
+
+
+def utc_iso_timestamp() -> str:
+    """Current UTC time as a filesystem-safe stamp, e.g. ``2026-05-11T20-30-00Z``.
+
+    Dashes replace the colons of the time part so the value can be used
+    in file and directory names.
+    """
+
+    now = datetime.now(tz=UTC)
+    return f"{now:%Y-%m-%dT%H-%M-%SZ}"
--- a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
+++ b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
@ -0,0 +1,311 @@
+"""Per-upload ingestion settings shared across every benchmark.
+
+The SurfSense ``POST /api/v1/documents/fileupload`` endpoint exposes
+exactly three knobs (verified at
+``surfsense_backend/app/routes/documents_routes.py`` and
+``surfsense_backend/app/etl_pipeline/etl_document.py``):
+
+* ``processing_mode``     — ``"basic"`` (default) | ``"premium"``
+* ``use_vision_llm``      — ``bool`` (run vision LLM during ingest to
+                            extract image content / captions / tables)
+* ``should_summarize``    — ``bool`` (generate document summary)
+
+This module gives every benchmark a uniform way to:
+
+1. Receive sensible per-benchmark defaults (text-only benchmarks
+   default vision off; image-bearing benchmarks default vision on).
+2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
+   ``--processing-mode {basic,premium}``,
+   ``--should-summarize`` / ``--no-summarize``).
+3. Persist the *actual* settings used into the doc-map manifest and
+   the run artifact so reports can show "vision=ON, mode=premium →
+   65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
+
+A/B testing on the same corpus
+------------------------------
+
+SurfSense dedupes uploads by ``(filename, search_space_id)`` — NOT by
+content hash and NOT by ingestion settings. Re-uploading the same
+filename to the same SearchSpace with a different ``use_vision_llm``
+flag will hit the duplicate branch and *not* re-process. To compare
+two settings combos head-to-head on the same corpus you must give
+each combo its own SearchSpace, which today means:
+
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --no-vision-llm   # baseline run
+    run      <s> <bench>
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --use-vision-llm  # vision arm
+    run      <s> <bench>
+
+The runs land in different timestamped subdirectories under
+``data/<suite>/runs/`` and ``report --suite <s>`` aggregates whichever
+manifest is currently latest per benchmark.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections.abc import Mapping
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Keep the constant list of valid processing modes here so benchmarks
+# don't have to re-import from the backend (they don't have access to
+# the backend package anyway).
+PROCESSING_MODES: tuple[str, ...] = ("basic", "premium")
+
+
+@dataclass(frozen=True)
+class IngestSettings:
+    """Immutable bundle of the three per-upload ingest knobs.
+
+    Instantiate directly to declare a benchmark's defaults, or call
+    ``IngestSettings.merge(defaults, opts)`` to layer CLI overrides over
+    them. The resolved value is handed to ``DocumentsClient.upload``.
+    """
+
+    use_vision_llm: bool = False
+    processing_mode: str = "basic"
+    should_summarize: bool = False
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the three knobs as a plain dict."""
+
+        return dict(
+            use_vision_llm=self.use_vision_llm,
+            processing_mode=self.processing_mode,
+            should_summarize=self.should_summarize,
+        )
+
+    @classmethod
+    def merge(cls, defaults: IngestSettings, opts: Mapping[str, Any]) -> IngestSettings:
+        """Layer CLI overrides from ``opts`` over ``defaults``.
+
+        ``opts`` is the kwargs dict built by ``core.cli`` from the
+        argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Only the
+        keys ``use_vision_llm`` (bool or None), ``processing_mode`` (str
+        or None) and ``should_summarize`` (bool or None) are read; any
+        other key is ignored, so benchmarks can pass their own opts
+        through. A ``None`` / missing value keeps the default.
+        """
+
+        vision = _coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm)
+        mode = _coerce_mode(opts.get("processing_mode"), defaults.processing_mode)
+        summarize = _coerce_bool(opts.get("should_summarize"), defaults.should_summarize)
+        return cls(use_vision_llm=vision, processing_mode=mode, should_summarize=summarize)
+
+    def render_label(self) -> str:
+        """One-line human-readable summary for reports and log lines."""
+
+        vision = "on" if self.use_vision_llm else "off"
+        summarize = "on" if self.should_summarize else "off"
+        parts = (
+            f"vision={vision}",
+            f"mode={self.processing_mode}",
+            f"summarize={summarize}",
+        )
+        return ", ".join(parts)
+
+
+def _coerce_bool(value: Any, default: bool) -> bool:
+    """Resolve a tri-state CLI flag value to a concrete bool.
+
+    ``None`` means the operator didn't pass the flag, so ``default`` (the
+    benchmark default) is returned. Strings are true only when, after
+    trimming and lower-casing, they are one of ``1`` / ``true`` /
+    ``yes`` / ``on``. Every other value goes through ``bool()``.
+    """
+
+    if value is None:
+        return default
+    if isinstance(value, str):
+        truthy_words = {"1", "true", "yes", "on"}
+        return value.strip().lower() in truthy_words
+    # Covers real bools (returned unchanged) as well as ints and others.
+    return bool(value)
+
+
+def _coerce_mode(value: Any, default: str) -> str:
+    """Normalise a ``processing_mode`` override, or fall back to ``default``.
+
+    ``None`` and the empty string select ``default``. Anything else is
+    stringified, trimmed and lower-cased, and must then be a member of
+    ``PROCESSING_MODES``; otherwise ``ValueError`` is raised.
+    """
+
+    if value is None or value == "":
+        return default
+    normalised = str(value).strip().lower()
+    if normalised in PROCESSING_MODES:
+        return normalised
+    raise ValueError(
+        f"Invalid processing_mode {normalised!r}; must be one of {PROCESSING_MODES}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Argparse helper
+# ---------------------------------------------------------------------------
+
+
+def _add_bool_pair(
+    parser: argparse.ArgumentParser,
+    *,
+    dest: str,
+    on_flag: str,
+    off_flag: str,
+    on_help: str,
+    off_help: str,
+) -> None:
+    """Register a mutually exclusive on/off flag pair sharing one ``dest``.
+
+    ``argparse.BooleanOptionalAction`` is deliberately avoided: it would
+    auto-generate ``--no-use-vision-llm`` rather than the friendlier
+    ``--no-vision-llm`` that operators reach for. Both flags default to
+    ``None``, so ``IngestSettings.merge`` can distinguish "silent" from
+    "explicit false".
+    """
+
+    exclusive = parser.add_mutually_exclusive_group()
+    variants = (
+        (on_flag, "store_true", on_help),
+        (off_flag, "store_false", off_help),
+    )
+    for flag, action, help_text in variants:
+        exclusive.add_argument(
+            flag,
+            dest=dest,
+            action=action,
+            default=None,
+            help=help_text,
+        )
+
+
+def add_ingest_settings_args(
+    parser: argparse.ArgumentParser,
+    *,
+    defaults: IngestSettings,
+) -> None:
+    """Register the ingest-settings CLI flags on ``parser``.
+
+    Adds an "ingest settings" argument group holding a
+    ``--use-vision-llm`` / ``--no-vision-llm`` pair, a
+    ``--processing-mode`` choice, and a ``--should-summarize`` /
+    ``--no-summarize`` pair. Every option defaults to ``None`` so that
+    "operator didn't pass the flag" stays distinguishable from "operator
+    explicitly passed false"; ``IngestSettings.merge`` then folds in the
+    benchmark default only when the operator was silent. ``defaults`` is
+    used purely to render the help text.
+    """
+
+    vision_default = "on" if defaults.use_vision_llm else "off"
+    summarize_default = "on" if defaults.should_summarize else "off"
+
+    settings_group = parser.add_argument_group(
+        "ingest settings",
+        "Per-upload knobs (forwarded to /documents/fileupload). "
+        f"Defaults for this benchmark: {defaults.render_label()}.",
+    )
+
+    vision_on_help = (
+        "Run vision LLM during ingest to extract image content "
+        "(default for this benchmark: "
+        f"{vision_default})."
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="use_vision_llm",
+        on_flag="--use-vision-llm",
+        off_flag="--no-vision-llm",
+        on_help=vision_on_help,
+        off_help="Skip vision LLM during ingest (text-only ETL).",
+    )
+
+    mode_help = (
+        "SurfSense ETL processing mode (premium uses a 10x page "
+        "multiplier and typically routes to a stronger ETL). "
+        f"Default for this benchmark: {defaults.processing_mode!r}."
+    )
+    settings_group.add_argument(
+        "--processing-mode",
+        dest="processing_mode",
+        choices=PROCESSING_MODES,
+        default=None,
+        help=mode_help,
+    )
+
+    summarize_on_help = (
+        "Have SurfSense generate a document summary at ingest "
+        "(default for this benchmark: "
+        f"{summarize_default})."
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="should_summarize",
+        on_flag="--should-summarize",
+        off_flag="--no-summarize",
+        on_help=summarize_on_help,
+        off_help="Skip per-document summary generation.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Doc-map manifest helpers
+# ---------------------------------------------------------------------------
+#
+# Every benchmark writes a doc-map JSONL under ``data/<suite>/maps/`` that
+# pairs source identifiers (case_id, snippet_id, doc_path, …) to the
+# SurfSense document_ids returned by the upload. To make the report
+# self-describing we also write a header line:
+#
+#     {"__settings__": {"use_vision_llm": ..., "processing_mode": ..., ...}}
+#
+# The helpers below centralise that protocol so each benchmark only has to
+# call ``settings_header_line`` (when writing) and ``read_settings_header``
+# or ``is_settings_header`` (when reading).
+
+SETTINGS_HEADER_KEY = "__settings__"
+
+
+def settings_header_line(settings: IngestSettings) -> str:
+    """Serialise ``settings`` as the doc-map header line.
+
+    The result is a JSON object with ``settings.to_dict()`` stored under
+    ``SETTINGS_HEADER_KEY``; no trailing newline is appended.
+    """
+
+    header = {SETTINGS_HEADER_KEY: settings.to_dict()}
+    return json.dumps(header)
+
+
+def is_settings_header(row: Mapping[str, Any]) -> bool:
+    """Tell whether ``row`` carries the ``SETTINGS_HEADER_KEY`` key."""
+
+    has_header_key = SETTINGS_HEADER_KEY in row
+    return has_header_key
+
+
+def read_settings_header(map_path: Path) -> dict[str, Any]:
+    """Read the ``__settings__`` header out of a doc-map JSONL.
+
+    Only the first non-blank line is inspected. Returns ``{}`` on a
+    missing file, an empty file, an unreadable or undecodable file, a
+    first line that is not valid JSON, a first line that is not a
+    settings header (e.g. a corpus ingested before this feature
+    existed), or a header whose value is not a JSON object.
+    Callers use this purely to surface settings in the report; it
+    must never fail the run.
+    """
+
+    if not map_path.exists():
+        return {}
+    try:
+        with map_path.open("r", encoding="utf-8") as fh:
+            for raw_line in fh:
+                line = raw_line.strip()
+                if not line:
+                    continue
+                row = json.loads(line)
+                header = row.get(SETTINGS_HEADER_KEY) if isinstance(row, dict) else None
+                # A header value that is not an object (null, list, string,
+                # number) is malformed; report "not recorded" instead of
+                # letting ``dict(...)`` raise.
+                return dict(header) if isinstance(header, dict) else {}
+    except (OSError, ValueError):
+        # ``ValueError`` covers both ``json.JSONDecodeError`` and the
+        # ``UnicodeDecodeError`` raised for a file that is not UTF-8.
+        return {}
+    return {}
+
+
+def format_ingest_settings_md(settings: Any) -> str:
+    """Format recorded ingest settings as one Markdown bullet.
+
+    A value that is not a non-empty mapping produces a "not recorded"
+    bullet. Otherwise the two boolean knobs render as ``on`` / ``off``
+    by truthiness, and a missing or empty ``processing_mode`` renders as
+    ``basic``.
+    """
+
+    recorded = isinstance(settings, Mapping) and bool(settings)
+    if not recorded:
+        return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
+
+    def on_off(key: str) -> str:
+        return "on" if settings.get(key) else "off"
+
+    vision = on_off("use_vision_llm")
+    mode = settings.get("processing_mode") or "basic"
+    summarize = on_off("should_summarize")
+    return (
+        f"- SurfSense ingest settings: vision_llm=`{vision}`, "
+        f"processing_mode=`{mode}`, summarize=`{summarize}`"
+    )
+
+
+__all__ = [
+    "PROCESSING_MODES",
+    "SETTINGS_HEADER_KEY",
+    "IngestSettings",
+    "add_ingest_settings_args",
+    "format_ingest_settings_md",
+    "is_settings_header",
+    "read_settings_header",
+    "settings_header_line",
+]
--- a/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
@ -0,0 +1,50 @@
+"""Pure-function metric primitives. Lazy imports."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .comparison import McnemarResult, bootstrap_delta_ci, mcnemar_test, paired_aggregate
+    from .mc_accuracy import AccuracyResult, accuracy_with_wilson_ci, wilson_ci
+    from .retrieval import RetrievalScores, mrr, ndcg_at_k, recall_at_k, score_run
+
+__all__ = [
+    "AccuracyResult",
+    "McnemarResult",
+    "RetrievalScores",
+    "accuracy_with_wilson_ci",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "mrr",
+    "ndcg_at_k",
+    "paired_aggregate",
+    "recall_at_k",
+    "score_run",
+    "wilson_ci",
+]
+
+
+_MODULE_FOR = {
+    "AccuracyResult": "mc_accuracy",
+    "accuracy_with_wilson_ci": "mc_accuracy",
+    "wilson_ci": "mc_accuracy",
+    "RetrievalScores": "retrieval",
+    "mrr": "retrieval",
+    "ndcg_at_k": "retrieval",
+    "recall_at_k": "retrieval",
+    "score_run": "retrieval",
+    "McnemarResult": "comparison",
+    "bootstrap_delta_ci": "comparison",
+    "mcnemar_test": "comparison",
+    "paired_aggregate": "comparison",
+}
+
+
+def __getattr__(name: str):
+    """Resolve a public metric name on first access.
+
+    Names listed in ``_MODULE_FOR`` are fetched from the sibling
+    submodule they map to, which is imported on demand; any other name
+    raises ``AttributeError``.
+    """
+
+    submodule = _MODULE_FOR.get(name)
+    if submodule is None:
+        raise AttributeError(f"module 'surfsense_evals.core.metrics' has no attribute {name!r}")
+
+    from importlib import import_module
+
+    return getattr(import_module(f".{submodule}", __name__), name)
--- a/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
@ -0,0 +1,258 @@
+"""Paired comparison statistics for head-to-head benchmarks.
+
+In every head-to-head benchmark (currently MedXpertQA-MM and
+MMLongBench-Doc) each question is answered by both arms (Native PDF
+and SurfSense). That makes per-question outcomes paired, so
+``McNemar's test`` on the discordant pairs is the right significance
+test for "are the two arms different?". We also expose a bootstrap
+delta CI for visualising effect size.
+
+Aggregate cost / latency / token deltas are mean-based; the runner
+slices them by arm before passing them in.
+"""
+
+from __future__ import annotations
+
+import math
+import statistics
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class McnemarResult:
+    """Outcome of a McNemar test: discordant counts plus test statistics."""
+
+    n_total: int
+    b: int  # native correct, surfsense wrong
+    c: int  # native wrong,   surfsense correct
+    statistic: float
+    p_value: float
+    method: str
+
+    def to_dict(self) -> dict[str, float | int | str]:
+        """Return the result as a dict, with ``b`` / ``c`` under descriptive keys."""
+
+        return dict(
+            n_total=self.n_total,
+            b_native_correct_only=self.b,
+            c_surfsense_correct_only=self.c,
+            statistic=self.statistic,
+            p_value=self.p_value,
+            method=self.method,
+        )
+
+
+def mcnemar_test(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    use_exact_below: int = 11,
+) -> McnemarResult:
+    """Run McNemar's paired test on per-question correctness flags.
+
+    ``arm_a_correct`` is the reference arm (typically "native") and
+    ``arm_b_correct`` the challenger (typically "surfsense"). Only the
+    discordant pairs matter: ``b`` counts questions where A alone is
+    correct, ``c`` those where B alone is correct.
+
+    Three outcomes, reported through ``method``:
+
+    * ``"degenerate"`` — no discordant pairs; statistic 0, p-value 1.
+    * ``"exact"`` — ``b + c < use_exact_below``; two-sided exact binomial
+      p-value, with ``min(b, c)`` as the statistic.
+    * ``"chi2_cc"`` — otherwise; continuity-corrected chi-square
+      (Edwards 1948) with one degree of freedom.
+
+    Callers can raise ``use_exact_below`` if they prefer the more
+    conservative ``b + c < 25`` rule. No external statistics package is
+    used. Raises ``ValueError`` when the arms differ in length.
+    """
+
+    n = len(arm_a_correct)
+    if n != len(arm_b_correct):
+        raise ValueError(
+            f"Length mismatch: arm_a={len(arm_a_correct)}, arm_b={len(arm_b_correct)}"
+        )
+
+    # Tally both kinds of discordant pair in a single pass.
+    b = 0
+    c = 0
+    for ref_ok, challenger_ok in zip(arm_a_correct, arm_b_correct):
+        if ref_ok and not challenger_ok:
+            b += 1
+        elif challenger_ok and not ref_ok:
+            c += 1
+    discordant = b + c
+
+    if discordant == 0:
+        return McnemarResult(
+            n_total=n, b=b, c=c, statistic=0.0, p_value=1.0, method="degenerate"
+        )
+
+    if discordant >= use_exact_below:
+        # Chi-square with continuity correction (McNemar-Edwards).
+        chi = ((abs(b - c) - 1) ** 2) / discordant
+        return McnemarResult(
+            n_total=n,
+            b=b,
+            c=c,
+            statistic=chi,
+            p_value=_chi2_sf(chi, df=1),
+            method="chi2_cc",
+        )
+
+    # Exact binomial: under H0 each discordant pair is a Bernoulli(0.5).
+    # p-value = 2 * P(X <= min(b,c) | n=discordant, p=0.5), capped at 1.
+    smaller = min(b, c)
+    lower_tail = sum(_binom_pmf(discordant, i) for i in range(smaller + 1))
+    return McnemarResult(
+        n_total=n,
+        b=b,
+        c=c,
+        statistic=float(smaller),
+        p_value=min(1.0, 2.0 * lower_tail),
+        method="exact",
+    )
+
+
+def _binom_pmf(n: int, k: int) -> float:
+    """Probability of exactly ``k`` successes in ``n`` fair (p = 0.5) trials."""
+
+    ways = math.comb(n, k)
+    return ways * (0.5 ** n)
+
+
+def _chi2_sf(x: float, *, df: int) -> float:
+    """Chi-square survival function, ``1 - CDF(x)``.
+
+    Non-positive ``x`` yields 1. ``df == 1`` uses a closed form; every
+    other ``df`` goes through the regularised upper incomplete gamma
+    ``Q(df / 2, x / 2)``.
+    """
+
+    if x <= 0:
+        return 1.0
+    half_x = x / 2.0
+    if df != 1:
+        # General fallback via regularized upper incomplete gamma.
+        return _gammaincc(df / 2.0, half_x)
+    # Chi^2(1) = N(0,1)^2; sf(x) = 2 * Phi_complement(sqrt(x))
+    return math.erfc(math.sqrt(half_x))
+
+
+def _gammaincc(a: float, x: float, *, max_iter: int = 200, tol: float = 1e-12) -> float:
+    """Regularised upper incomplete gamma ``Q(a, x)``.
+
+    Returns NaN for ``x < 0`` or ``a <= 0`` and 1 for ``x == 0``. For
+    ``x < a + 1`` the value is obtained as one minus the series for
+    ``P(a, x)``; otherwise the continued fraction is used directly.
+    """
+
+    if x < 0 or a <= 0:
+        return float("nan")
+    if x == 0:
+        return 1.0
+    if x >= a + 1.0:
+        return _gammaincc_cf(a, x, max_iter=max_iter, tol=tol)
+    # Series for P(a, x); subtract from 1.
+    lower = _gammainc_series(a, x, max_iter=max_iter, tol=tol)
+    return 1.0 - lower
+
+
+def _gammainc_series(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    """Series evaluation of the regularised lower incomplete gamma ``P(a, x)``.
+
+    Accumulates terms ``x**n / (a * (a + 1) * ... * (a + n))`` until a term
+    drops below ``tol`` relative to the running sum, or ``max_iter``
+    terms have been used, then scales by ``exp(-x) * x**a / Gamma(a)``.
+    """
+
+    prefactor = math.exp(-x + a * math.log(x) - math.lgamma(a))
+    current = 1.0 / a
+    total = current
+    for step in range(1, max_iter):
+        current *= x / (a + step)
+        total += current
+        if abs(current) < abs(total) * tol:
+            break
+    return total * prefactor
+
+
+def _gammaincc_cf(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    """Continued-fraction evaluation of the regularised upper gamma ``Q(a, x)``.
+
+    ``_gammaincc`` routes here when ``x >= a + 1``. Iterates until the
+    per-step multiplier ``delta`` is within ``tol`` of 1 or ``max_iter``
+    is reached, then scales the accumulated fraction ``h`` by
+    ``exp(-x) * x**a / Gamma(a)``.
+
+    NOTE(review): the recurrence looks like the modified Lentz scheme —
+    confirm against a reference before relying on that name.
+    """
+
+    b = x + 1.0 - a
+    # Start ``c_val`` at a huge value (1e300) rather than infinity.
+    c_val = 1.0 / 1e-300
+    d = 1.0 / b
+    h = d
+    for i in range(1, max_iter):
+        # ``an`` is this step's partial numerator; ``b`` is the partial
+        # denominator, growing by 2 each step.
+        an = -i * (i - a)
+        b += 2.0
+        d = an * d + b
+        # Clamp near-zero intermediates to 1e-300 so the reciprocal and
+        # the division below cannot blow up.
+        if abs(d) < 1e-300:
+            d = 1e-300
+        c_val = b + an / c_val
+        if abs(c_val) < 1e-300:
+            c_val = 1e-300
+        d = 1.0 / d
+        delta = d * c_val
+        h *= delta
+        # Converged once the multiplier no longer changes ``h``.
+        if abs(delta - 1.0) < tol:
+            break
+    log_pre = -x + a * math.log(x) - math.lgamma(a)
+    return h * math.exp(log_pre)
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap delta CI
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class BootstrapDelta:
+    """Point estimate of a delta with its bootstrap confidence interval."""
+
+    delta: float
+    ci_low: float
+    ci_high: float
+    n_resamples: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        """Return the four fields as a plain dict."""
+
+        return dict(
+            delta=self.delta,
+            ci_low=self.ci_low,
+            ci_high=self.ci_high,
+            n_resamples=self.n_resamples,
+        )
+
+
+def bootstrap_delta_ci(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    n_resamples: int = 5000,
+    level: float = 0.95,
+    random_state: int | None = 0,
+) -> BootstrapDelta:
+    """Percentile bootstrap CI for ``mean(arm_b) - mean(arm_a)`` on paired data.
+
+    Each resample draws *paired indices* with replacement, so the
+    dependency between the two arms is preserved. ``random_state`` seeds
+    the generator; the default of ``0`` makes repeated calls reproduce
+    the same interval. Empty input yields an all-zero result, and arms
+    of different lengths raise ``ValueError``.
+    """
+
+    n = len(arm_a_correct)
+    if n != len(arm_b_correct):
+        raise ValueError("paired arms must have the same length")
+    if n == 0:
+        return BootstrapDelta(0.0, 0.0, 0.0, 0)
+
+    ref = np.asarray(arm_a_correct, dtype=np.int8)
+    challenger = np.asarray(arm_b_correct, dtype=np.int8)
+    observed = float(challenger.mean() - ref.mean())
+
+    rng = np.random.default_rng(random_state)
+    resampled = np.empty(n_resamples, dtype=np.float64)
+    for slot in range(n_resamples):
+        # One index draw per resample, applied to both arms.
+        picks = rng.integers(0, n, size=n)
+        resampled[slot] = challenger[picks].mean() - ref[picks].mean()
+
+    tail = (1.0 - level) / 2.0
+    lower = float(np.quantile(resampled, tail))
+    upper = float(np.quantile(resampled, 1 - tail))
+    return BootstrapDelta(
+        delta=observed, ci_low=lower, ci_high=upper, n_resamples=n_resamples
+    )
+
+
+# ---------------------------------------------------------------------------
+# Simple aggregate helpers (cost / latency / tokens)
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class Aggregate:
+    """Summary statistics (mean, median, p95) over ``n`` values."""
+
+    mean: float
+    median: float
+    p95: float
+    n: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        """Return the four fields as a plain dict."""
+
+        return dict(mean=self.mean, median=self.median, p95=self.p95, n=self.n)
+
+
+def paired_aggregate(values: Sequence[float]) -> Aggregate:
+    """Summarise a list of numbers (e.g. cost-per-question).
+
+    Returns the mean, median and 95th percentile together with the
+    count. Empty input yields an all-zero ``Aggregate``.
+    """
+
+    if not values:
+        return Aggregate(0.0, 0.0, 0.0, 0)
+    samples = np.asarray(values, dtype=np.float64)
+    mean_value = float(samples.mean())
+    median_value = float(statistics.median(values))
+    p95_value = float(np.quantile(samples, 0.95))
+    return Aggregate(mean=mean_value, median=median_value, p95=p95_value, n=len(values))
+
+
+__all__ = [
+    "Aggregate",
+    "BootstrapDelta",
+    "McnemarResult",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "paired_aggregate",
+]
--- a/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
@ -0,0 +1,130 @@
+"""Multiple-choice accuracy + Wilson 95% confidence intervals.
+
+Wilson CI is preferred over normal-approximation because MIRAGE's
+per-task subsets can be small (PubMedQA* and BioASQ-Y/N have a few
+hundred questions each) and Wilson handles n→0 / p→{0,1} edges
+gracefully.
+
+Reference for the closed form: Wilson (1927); identical to the
+``statsmodels.stats.proportion.proportion_confint(method='wilson')``
+output and what scikit-learn implements internally for its bounded
+estimators.
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class AccuracyResult:
+    """Accuracy for one task together with its Wilson interval bounds."""
+
+    n_correct: int
+    n_total: int
+    accuracy: float
+    ci_low: float
+    ci_high: float
+
+    def to_dict(self) -> dict[str, float | int]:
+        """Return every field as a plain dict, in declaration order."""
+
+        return dict(
+            n_correct=self.n_correct,
+            n_total=self.n_total,
+            accuracy=self.accuracy,
+            ci_low=self.ci_low,
+            ci_high=self.ci_high,
+        )
+
+
+# Two-sided z critical values keyed by confidence level; e.g. 0.95 maps
+# to z_{0.975} ≈ 1.959964.
+_Z_FOR_LEVEL: dict[float, float] = {
+    0.90: 1.6448536269514722,
+    0.95: 1.959963984540054,
+    0.99: 2.5758293035489004,
+}
+
+
+def wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> tuple[float, float]:
+    """Two-sided Wilson score confidence interval for a proportion.
+
+    Args:
+        n_correct: Number of successes; must lie in ``[0, n_total]``.
+        n_total: Number of trials.
+        level: Confidence level; must be a key of ``_Z_FOR_LEVEL``.
+
+    Returns:
+        ``(low, high)``, each clamped to ``[0.0, 1.0]``. ``n_total <= 0``
+        returns ``(0.0, 1.0)`` — the maximally uncertain interval —
+        before any other validation runs.
+
+    Raises:
+        ValueError: If ``level`` is unsupported, or if ``n_correct`` is
+            outside ``[0, n_total]``.
+    """
+
+    if n_total <= 0:
+        return 0.0, 1.0
+    if level not in _Z_FOR_LEVEL:
+        raise ValueError(f"Unsupported confidence level {level!r}")
+    # An out-of-range count makes p * (1 - p) negative. Depending on n and
+    # z that either trips math.sqrt with an opaque "math domain error" or
+    # yields a meaningless interval, so reject it up front.
+    if n_correct < 0 or n_correct > n_total:
+        raise ValueError(
+            f"n_correct must be in [0, n_total]; got n_correct={n_correct}, n_total={n_total}"
+        )
+    z = _Z_FOR_LEVEL[level]
+    p = n_correct / n_total
+    n = n_total
+    denom = 1.0 + (z * z) / n
+    centre = (p + (z * z) / (2 * n)) / denom
+    half = (z / denom) * math.sqrt((p * (1 - p) / n) + (z * z) / (4 * n * n))
+    low = max(0.0, centre - half)
+    high = min(1.0, centre + half)
+    return low, high
+
+
+def accuracy_with_wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> AccuracyResult:
+    """Bundle the point accuracy and its Wilson interval into one result.
+
+    Accuracy is ``0.0`` when ``n_total`` is zero. Raises ``ValueError``
+    for a negative ``n_total`` or an ``n_correct`` outside
+    ``[0, n_total]``.
+    """
+
+    if n_total < 0:
+        raise ValueError(f"n_total must be >= 0, got {n_total}")
+    if n_correct > n_total or n_correct < 0:
+        raise ValueError(
+            f"n_correct must be in [0, n_total]; got n_correct={n_correct}, n_total={n_total}"
+        )
+    if n_total > 0:
+        point_estimate = n_correct / n_total
+    else:
+        point_estimate = 0.0
+    bounds = wilson_ci(n_correct, n_total, level=level)
+    lower, upper = bounds
+    return AccuracyResult(
+        n_correct=n_correct,
+        n_total=n_total,
+        accuracy=point_estimate,
+        ci_low=lower,
+        ci_high=upper,
+    )
+
+
+def per_task_accuracy(
+    rows: Sequence[Mapping[str, object]],
+    *,
+    task_key: str = "task",
+    correct_key: str = "is_correct",
+    level: float = 0.95,
+) -> dict[str, AccuracyResult]:
+    """Bucket ``rows`` by ``task_key`` and score each bucket.
+
+    A row counts as correct when ``row[correct_key]`` is truthy. Rows
+    without ``task_key`` land in the ``""`` bucket. Tasks appear in the
+    result in order of first occurrence.
+    """
+
+    totals: dict[str, int] = {}
+    hits: dict[str, int] = {}
+    for record in rows:
+        name = str(record.get(task_key, ""))
+        totals[name] = totals.get(name, 0) + 1
+        gained = 1 if record.get(correct_key) else 0
+        hits[name] = hits.get(name, 0) + gained
+
+    results: dict[str, AccuracyResult] = {}
+    for name, total in totals.items():
+        results[name] = accuracy_with_wilson_ci(hits[name], total, level=level)
+    return results
+
+
+def macro_accuracy(per_task: Mapping[str, AccuracyResult]) -> float:
+    """Unweighted mean of the per-task accuracies; ``0.0`` when empty."""
+
+    if per_task:
+        accuracies = (result.accuracy for result in per_task.values())
+        return sum(accuracies) / len(per_task)
+    return 0.0
+
+
+__all__ = [
+    "AccuracyResult",
+    "accuracy_with_wilson_ci",
+    "macro_accuracy",
+    "per_task_accuracy",
+    "wilson_ci",
+]
--- a/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
@ -0,0 +1,132 @@
+"""Retrieval metrics: Recall@k, MRR, nDCG@k.
+
+Used by CUREv1's runner to score the SurfSense arm against the
+benchmark's qrels. ``corpus_id`` is the canonical CUREv1 passage id
+(string); the runner maps SurfSense ``chunk_id`` → ``document_id`` →
+``corpus_id`` before calling these.
+
+Graded relevance (CUREv1 uses 0/1/2 grades) is honoured by ``ndcg_at_k``;
+``recall_at_k`` and ``mrr`` flatten anything > 0 to "relevant".
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class RetrievalScores:
+    """Run-level retrieval metrics together with the query count."""
+
+    recall_at_k: dict[int, float]
+    mrr: float
+    ndcg_at_10: float
+    n_queries: int
+
+    def to_dict(self) -> dict:
+        """Plain-dict view; ``recall_at_k`` is copied, not shared."""
+
+        recall_copy = dict(self.recall_at_k)
+        return dict(
+            recall_at_k=recall_copy,
+            mrr=self.mrr,
+            ndcg_at_10=self.ndcg_at_10,
+            n_queries=self.n_queries,
+        )
+
+
+def recall_at_k(retrieved: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """Fraction of ``relevant`` documents found in ``retrieved[:k]``.
+
+    Args:
+        retrieved: Ranked doc ids, best first. Repeated ids are allowed
+            and still occupy a rank slot, but each relevant doc is
+            credited at most once.
+        relevant: Ids of the relevant documents (duplicates ignored).
+        k: Rank cutoff.
+
+    Returns:
+        A value in ``[0.0, 1.0]``; ``0.0`` when ``relevant`` is empty.
+    """
+
+    if not relevant:
+        return 0.0
+    relevant_set = set(relevant)
+    if not relevant_set:
+        return 0.0
+    # Intersect rather than count occurrences: when the same id appears
+    # several times in the top-k, a per-occurrence count would push recall
+    # above 1.0.
+    found = relevant_set.intersection(list(retrieved)[:k])
+    return len(found) / len(relevant_set)
+
+
+def mrr(retrieved: Sequence[str], relevant: Iterable[str]) -> float:
+    """Return ``1 / rank`` of the earliest relevant doc, or ``0.0`` if absent."""
+
+    targets = set(relevant)
+    first_hit = next(
+        (position for position, doc_id in enumerate(retrieved, start=1) if doc_id in targets),
+        None,
+    )
+    if first_hit is None:
+        return 0.0
+    return 1.0 / first_hit
+
+
+def _dcg_at_k(grades: Sequence[float], k: int) -> float:
+    """Discounted cumulative gain over the first ``k`` entries of ``grades``.
+
+    Gain is ``2**grade - 1``; the discount at 1-based rank ``i`` is
+    ``log2(i + 1)``.
+    """
+
+    window = grades[:k]
+    dcg = 0.0
+    # ``log_arg`` is rank + 1, so the first position is divided by log2(2) == 1.
+    for log_arg, grade in zip(range(2, len(window) + 2), window):
+        dcg += (2.0 ** grade - 1.0) / math.log2(log_arg)
+    return dcg
+
+
+def ndcg_at_k(
+    retrieved: Sequence[str],
+    qrels: Mapping[str, float],
+    k: int,
+) -> float:
+    """nDCG@k against graded ``qrels`` (``{doc_id: grade}``).
+
+    Unjudged documents in ``retrieved`` contribute zero gain. A doc id
+    that appears more than once is credited only at its first (best)
+    rank; later repeats keep their rank slot but earn nothing, so the
+    score cannot exceed ``1.0``. The ideal ordering is ``qrels`` sorted
+    by grade descending.
+
+    Returns ``0.0`` when ``qrels`` is empty or the ideal DCG is zero.
+    """
+
+    if not qrels:
+        return 0.0
+    already_credited: set[str] = set()
+    grades: list[float] = []
+    for doc in retrieved:
+        if doc in already_credited:
+            # Repeat of an id seen at a better rank: no additional gain.
+            grades.append(0.0)
+            continue
+        already_credited.add(doc)
+        grades.append(float(qrels.get(doc, 0.0)))
+    dcg = _dcg_at_k(grades, k)
+    ideal = sorted((float(g) for g in qrels.values()), reverse=True)
+    idcg = _dcg_at_k(ideal, k)
+    if idcg == 0.0:
+        return 0.0
+    return dcg / idcg
+
+
+def score_run(
+    *,
+    per_query_retrieved: Mapping[str, Sequence[str]],
+    per_query_qrels: Mapping[str, Mapping[str, float]],
+    ks: Sequence[int] = (1, 5, 10, 32),
+    ndcg_k: int = 10,
+) -> RetrievalScores:
+    """Aggregate Recall@k, MRR, nDCG@k across a run.
+
+    ``per_query_retrieved`` maps ``query_id -> ordered list of doc ids``.
+    ``per_query_qrels`` maps ``query_id -> {doc_id: grade}`` (grade > 0
+    is relevant).
+
+    Only queries present in BOTH mappings are scored. A query that is
+    missing from either side is skipped entirely: it adds nothing to the
+    totals and is not counted in ``n_queries``. Callers that want failed
+    retrievals to count as zeros must pass an empty list for that query.
+
+    The ``ndcg_at_10`` field of the result holds nDCG at ``ndcg_k``
+    (10 by default). With no overlapping queries every metric is ``0.0``
+    and ``n_queries`` is ``0``.
+    """
+
+    # Walk the qrels mapping in its own order instead of a set
+    # intersection: set order for string ids varies between interpreter
+    # runs, which would make the float totals differ in their last bits.
+    qids = [qid for qid in per_query_qrels if qid in per_query_retrieved]
+    if not qids:
+        return RetrievalScores(recall_at_k={k: 0.0 for k in ks}, mrr=0.0, ndcg_at_10=0.0, n_queries=0)
+
+    recall_totals = {k: 0.0 for k in ks}
+    mrr_total = 0.0
+    ndcg_total = 0.0
+    for qid in qids:
+        retrieved = list(per_query_retrieved[qid])
+        qrels = per_query_qrels[qid]
+        # Recall and MRR are binary: any positive grade counts as relevant.
+        relevant_docs = [d for d, g in qrels.items() if g > 0]
+        for k in ks:
+            recall_totals[k] += recall_at_k(retrieved, relevant_docs, k)
+        mrr_total += mrr(retrieved, relevant_docs)
+        ndcg_total += ndcg_at_k(retrieved, qrels, ndcg_k)
+
+    n = len(qids)
+    return RetrievalScores(
+        recall_at_k={k: v / n for k, v in recall_totals.items()},
+        mrr=mrr_total / n,
+        ndcg_at_10=ndcg_total / n,
+        n_queries=n,
+    )
+
+
+__all__ = ["RetrievalScores", "mrr", "ndcg_at_k", "recall_at_k", "score_run"]
--- a/surfsense_evals/src/surfsense_evals/core/parse/init.py
+++ b/surfsense_evals/src/surfsense_evals/core/parse/init.py
@ -0,0 +1,21 @@
+"""Parsers shared across suites: citations, MCQ envelopes, AI-SDK SSE."""
+
+from __future__ import annotations
+
+from .answer_letter import AnswerLetterResult, extract_answer_letter
+from .citations import CITATION_REGEX, CitationToken, ChunkCitation, UrlCitation, parse_citations
+from .freeform_answer import extract_freeform_answer
+from .sse import SseEvent, iter_sse_events
+
+__all__ = [
+    "CITATION_REGEX",
+    "CitationToken",
+    "ChunkCitation",
+    "UrlCitation",
+    "parse_citations",
+    "AnswerLetterResult",
+    "extract_answer_letter",
+    "extract_freeform_answer",
+    "SseEvent",
+    "iter_sse_events",
+]
--- a/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
+++ b/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
@ -0,0 +1,122 @@
+"""Robust extractor for MCQ answer letters.
+
+Handles three answer shapes seen in the wild:
+
+1. **MedRAG envelope** — ``{"step_by_step_thinking": "...", "answer_choice": "A"}``
+   embedded somewhere in the assistant message (often inside ```` ```json ```` /
+   ``` ``` ``` fences). The regex grabs the JSON object and reads the
+   ``answer_choice`` field.
+
+2. **Final-line letter** — e.g. ``Answer: B`` or ``The correct answer is (C).``.
+   Falls back to a permissive regex over the last few lines.
+
+3. **Bare letter** — single uppercase letter at the end of the message.
+
+The function returns the parsed letter (uppercased) plus a discriminator
+of which strategy fired so the runner / report can flag suspicious
+parses (typically zero-confidence parses indicate the model didn't
+follow the prompt).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Literal
+
+ParserStrategy = Literal["json_envelope", "answer_line", "bare_letter", "none"]
+
+
+@dataclass(frozen=True)
+class AnswerLetterResult:
+    """Parsed answer letter (or ``None``) plus the strategy that produced it."""
+
+    letter: str | None
+    strategy: ParserStrategy
+
+    @property
+    def found(self) -> bool:
+        """``True`` when a letter was extracted."""
+
+        if self.letter is None:
+            return False
+        return True
+
+
+# ---------------------------------------------------------------------------
+# Strategies
+# ---------------------------------------------------------------------------
+
+
+# Unfenced envelope: a brace-delimited object with no nested braces whose
+# ``answer_choice`` value is exactly one quoted letter.
+_JSON_BLOCK = re.compile(r"\{[^{}]*\"answer_choice\"\s*:\s*\"([A-Za-z])\"[^{}]*\}", re.DOTALL)
+# Fenced code block (optionally tagged ``json``) wrapping a JSON object.
+_FENCED_JSON = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL | re.IGNORECASE)
+# "Answer: B" / "The correct answer is (C)." style line ending in one letter.
+_ANSWER_LINE = re.compile(
+    r"(?:final\s*answer|answer\s*choice|the\s+correct\s+answer\s+is|answer)\s*[:=\-]?\s*"
+    r"\(?\s*([A-Za-z])\s*[\)\.]*\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+# A line holding nothing but one letter, optionally wrapped like "(A)" or "A.".
+_BARE_LETTER = re.compile(r"^\s*\(?\s*([A-Za-z])\s*[\)\.]*\s*$", re.MULTILINE)
+
+
+def _from_json_envelope(text: str) -> str | None:
+    """Read the ``answer_choice`` letter from a MedRAG-style JSON envelope.
+
+    Fenced code blocks are tried first, in document order; each must
+    parse as a JSON object with a string ``answer_choice``. If none
+    yields a letter, ``_JSON_BLOCK`` is searched over the whole text.
+
+    Returns the letter uppercased, or ``None``. Only an ASCII letter is
+    ever returned: a fenced value such as ``"(B)"`` yields ``"B"``, and a
+    value with no leading letter (e.g. ``"3"``) is skipped so later
+    strategies can still run.
+    """
+
+    # Try fenced code blocks first (most likely to contain the JSON).
+    for fence in _FENCED_JSON.finditer(text):
+        try:
+            obj = json.loads(fence.group(1))
+        except (json.JSONDecodeError, ValueError):
+            continue
+        if not isinstance(obj, dict):
+            continue
+        choice = obj.get("answer_choice")
+        if not isinstance(choice, str):
+            continue
+        # Drop leading whitespace and wrapper punctuation so "(B)" or
+        # "**B**" resolves to the letter, not to the decoration.
+        head = choice.lstrip(" \t\r\n([{*`'\"")[:1]
+        if head.isascii() and head.isalpha():
+            return head.upper()
+
+    # Fall back to a tolerant regex over the whole text (handles
+    # responses that drop the fences).
+    match = _JSON_BLOCK.search(text)
+    if match:
+        return match.group(1).upper()
+    return None
+
+
+def _from_answer_line(text: str) -> str | None:
+    """Return the letter from the last ``Answer: X``-style match, uppercased.
+
+    The final match wins because the answer is almost always near the
+    end of the message. Returns ``None`` when nothing matches.
+    """
+
+    captured = [hit.group(1) for hit in _ANSWER_LINE.finditer(text)]
+    if not captured:
+        return None
+    return captured[-1].upper()
+
+
+def _from_bare_letter(text: str) -> str | None:
+    """Look for a lone letter on one of the last three non-empty lines.
+
+    Lines are checked from the bottom up. Restricting the search to the
+    tail avoids picking up in-prose mentions of "A" or "I".
+    """
+
+    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
+    tail = [candidate for candidate in stripped_lines if candidate][-3:]
+    while tail:
+        hit = _BARE_LETTER.match(tail.pop())
+        if hit is not None:
+            return hit.group(1).upper()
+    return None
+
+
+def extract_answer_letter(text: str) -> AnswerLetterResult:
+    """Try each parsing strategy in turn and report the first that hits.
+
+    Order: JSON envelope → final-answer-line regex → bare-letter
+    fallback. Empty / whitespace-only text, or text no strategy can
+    parse, returns ``AnswerLetterResult(None, "none")``.
+    """
+
+    if not (text and text.strip()):
+        return AnswerLetterResult(None, "none")
+
+    pipeline = (
+        ("json_envelope", _from_json_envelope),
+        ("answer_line", _from_answer_line),
+        ("bare_letter", _from_bare_letter),
+    )
+    for label, strategy in pipeline:
+        letter = strategy(text)
+        if letter:
+            return AnswerLetterResult(letter, label)
+
+    return AnswerLetterResult(None, "none")
+
+
+__all__ = ["AnswerLetterResult", "ParserStrategy", "extract_answer_letter"]
--- a/surfsense_evals/src/surfsense_evals/core/parse/citations.py
+++ b/surfsense_evals/src/surfsense_evals/core/parse/citations.py
@ -0,0 +1,110 @@
+"""Python port of the canonical citation parser.
+
+Source of truth: ``surfsense_web/lib/citations/citation-parser.ts:20-21``.
+The pattern is byte-for-byte identical to the TS export ``CITATION_REGEX``
+so a SurfSense user reading the web client and a CUREv1 retrieval scorer
+running here see the same chunk_ids extracted from the same answer.
+
+The TS reference also handles a ``urlcite{N}`` placeholder produced by
+``preprocessCitationMarkdown`` — that pre-processing step is web-only
+(GFM autolink workaround), so the harness sees raw ``[citation:URL]``
+tokens and ``parse_citations`` returns them as ``UrlCitation`` directly.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any, Union
+
+# Pattern preserves the TS source verbatim:
+#   /[\[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g
+#
+# Notes:
+# * Matches both ASCII ``[]`` and Chinese fullwidth ``【】`` brackets.
+# * Allows an optional ZWSP (``\u200B``) just inside each bracket.
+# * ``citation:`` then EITHER a URL (anything not ``]``, ``】``, or ZWSP),
+#   OR a ``urlcite\d+`` placeholder, OR one or more comma-separated
+#   chunk ids (each optionally prefixed with ``doc-`` and optionally
+#   negative).
+# * URL char class deliberately excludes the closing brackets so a
+#   ``[citation:https://x.com]`` doesn't swallow the ``]``.
+# The ZWSP is spliced in as the actual code-point — the original TS
+# source uses the regex literal ``\u200B`` which the JS engine interprets
+# as the character. Python's ``re`` also understands ``\uXXXX`` escapes
+# in ``str`` patterns, so an escaped form would match the same text, but
+# we splice the literal character in via an f-string so the compiled
+# pattern holds the code-point itself. This keeps our pattern
+# functionally identical to the TS reference and lets
+# ``"\u200B" in CITATION_REGEX.pattern`` succeed.
+_ZWSP = "\u200B"
+CITATION_REGEX = re.compile(
+    rf"[\[【]{_ZWSP}?citation:\s*("
+    rf"https?://[^\]】{_ZWSP}]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*"
+    rf")\s*{_ZWSP}?[\]】]"
+)
+
+
+@dataclass(frozen=True)
+class ChunkCitation:
+    """A numeric chunk citation; ``is_docs_chunk`` marks the ``doc-`` prefix."""
+
+    chunk_id: int
+    is_docs_chunk: bool
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialise as a ``kind == "chunk"`` record."""
+
+        return dict(
+            kind="chunk",
+            chunk_id=self.chunk_id,
+            is_docs_chunk=self.is_docs_chunk,
+        )
+
+
+@dataclass(frozen=True)
+class UrlCitation:
+    """A citation that points at a URL rather than a chunk id."""
+
+    url: str
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialise as a ``kind == "url"`` record."""
+
+        return dict(kind="url", url=self.url)
+
+
+CitationToken = Union[ChunkCitation, UrlCitation]
+
+
+def parse_citations(text: str, *, url_map: dict[str, str] | None = None) -> list[CitationToken]:
+    """Extract every citation token from ``text``, in document order.
+
+    Each ``CITATION_REGEX`` match is handled by payload type:
+
+    * ``http://`` / ``https://`` payloads become a ``UrlCitation``.
+    * ``urlciteN`` placeholders are resolved through ``url_map`` and
+      dropped when the map is absent or has no entry for them (the web
+      client builds that map while preprocessing; the harness normally
+      does not).
+    * Anything else is a comma-separated id list such as
+      ``[citation:1, doc-2, -3]``, flattened into one ``ChunkCitation``
+      per id. Ids that fail ``int()`` are skipped.
+    """
+
+    tokens: list[CitationToken] = []
+    for hit in CITATION_REGEX.finditer(text):
+        payload = hit.group(1)
+        if payload.startswith(("http://", "https://")):
+            tokens.append(UrlCitation(url=payload.strip()))
+        elif payload.startswith("urlcite"):
+            if url_map and payload in url_map:
+                tokens.append(UrlCitation(url=url_map[payload]))
+        else:
+            for piece in payload.split(","):
+                ident = piece.strip()
+                from_docs = ident.startswith("doc-")
+                digits = ident[4:] if from_docs else ident
+                try:
+                    number = int(digits)
+                except ValueError:
+                    continue
+                tokens.append(ChunkCitation(chunk_id=number, is_docs_chunk=from_docs))
+    return tokens
+
+
+__all__ = [
+    "CITATION_REGEX",
+    "ChunkCitation",
+    "UrlCitation",
+    "CitationToken",
+    "parse_citations",
+]
--- a/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
+++ b/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
@ -0,0 +1,85 @@
+"""Extract free-form answers from open-ended LLM responses.
+
+Used by benchmarks that don't have a fixed letter set (MMLongBench-Doc,
+DocVQA-style benchmarks, future legal/finance suites). The contract:
+
+* Strip leading "Answer:" / "Final answer:" markers if present.
+* Drop fenced code blocks if the model wrapped its answer in one.
+* Trim leading/trailing whitespace.
+* Return the *last* meaningful chunk — models often think out loud
+  before stating the answer.
+
+If the message is empty or only contains a fence, return ``""``.
+"""
+
+from __future__ import annotations
+
+import re
+
+_ANSWER_PREFIX = re.compile(
+    r"^\s*(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+# Marker-only regex (no capture group) used to find every "Answer:"
+# token position. We then slice from the LAST marker's end to the
+# next newline ourselves — robust to multiple inline answers because
+# we never let the engine greedy-capture across markers.
+_ANSWER_MARKER = re.compile(
+    r"(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+_FENCED_BLOCK = re.compile(r"```[a-zA-Z0-9]*\s*([\s\S]*?)\s*```")
+
+
+def extract_freeform_answer(text: str) -> str:
+    """Pull the model's final answer out of a possibly-verbose response.
+
+    Candidates are tried in this order, stopping at the first non-empty
+    one:
+
+    1. The last non-empty line that *starts* with an answer marker
+       (``_ANSWER_PREFIX``), with the marker removed.
+    2. The text after the last answer marker anywhere in ``text``
+       (``_ANSWER_MARKER``), up to the next newline.
+    3. The contents of the last fenced block, or — when there is no
+       fence — the last non-empty line.
+
+    The winner then has surrounding whitespace and backticks stripped,
+    and one pair of wrapping quotes removed. Returns ``""`` for empty or
+    whitespace-only input.
+    """
+
+    if not text or not text.strip():
+        return ""
+
+    # 1. Find the last line that starts with an Answer: marker. If no
+    #    line matches, or the matching line is empty once the marker is
+    #    removed, ``candidate`` stays empty and steps 2-3 take over.
+    lines = [ln.rstrip() for ln in text.strip().splitlines()]
+    candidate = ""
+    for ln in reversed(lines):
+        if not ln.strip():
+            continue
+        if _ANSWER_PREFIX.search(ln):
+            candidate = _ANSWER_PREFIX.sub("", ln, count=1).strip()
+            break
+
+    if not candidate:
+        # 2. Inline match: find every "Answer:" marker position and
+        # slice from the LAST marker's end to the next newline. Robust
+        # to "preamble.Answer: 42" one-liners and multiple inline
+        # markers (we always pick the final, freshest one).
+        marker_matches = list(_ANSWER_MARKER.finditer(text))
+        if marker_matches:
+            last = marker_matches[-1]
+            tail = text[last.end():]
+            nl = tail.find("\n")
+            if nl >= 0:
+                tail = tail[:nl]
+            candidate = tail.strip()
+
+    if not candidate:
+        # 3. No usable "Answer:" marker — try fenced blocks.
+        fences = _FENCED_BLOCK.findall(text)
+        if fences:
+            candidate = fences[-1].strip()
+        else:
+            # Last non-empty line as a fallback.
+            for ln in reversed(lines):
+                if ln.strip():
+                    candidate = ln.strip()
+                    break
+
+    # 4. Strip wrapping backticks and one pair of wrapping quotes that
+    #    confuse the grader without changing meaning. The opening and
+    #    closing quote characters are not required to match each other.
+    candidate = candidate.strip().strip("`").strip()
+    if candidate.startswith(("\"", "'")) and candidate.endswith(("\"", "'")):
+        candidate = candidate[1:-1].strip()
+    return candidate
+
+
+__all__ = ["extract_freeform_answer"]
--- a/surfsense_evals/src/surfsense_evals/core/parse/sse.py
+++ b/surfsense_evals/src/surfsense_evals/core/parse/sse.py
@ -0,0 +1,72 @@
+"""Minimal SSE consumer compatible with SurfSense's wire format.
+
+SurfSense uses ``app/services/streaming/envelope/sse.py`` to frame events:
+
+* ``data: <single-line-string>\\n\\n``
+* ``data: <json-string>\\n\\n``  (most events)
+* ``data: [DONE]\\n\\n``  (terminator)
+
+There is no ``event:``, ``id:``, or ``retry:`` framing in production —
+``format_sse(payload)`` only emits the ``data:`` line. This implementation
+is therefore intentionally smaller than ``httpx-sse`` (which we still
+list as a dep so callers who want richer parsing can opt in): one event
+per ``data:`` line, separated by blank lines.
+
+We accept any line iterator (an ``httpx.Response.aiter_lines`` adapter
+in production, a list in tests) so this is unit-testable without a
+network mock.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class SseEvent:
+    """A parsed SSE event. Only the ``data`` field is populated.
+
+    Multi-line payloads (``data: a\\ndata: b``) are joined with ``\\n``
+    per the SSE spec, even though SurfSense doesn't currently emit them.
+    """
+
+    # Payload text with the ``data:`` prefix (and one optional following
+    # space) already removed from each contributing line.
+    data: str
+
+
+async def iter_sse_events(lines: AsyncIterator[str]) -> AsyncIterator[SseEvent]:
+    """Group ``data:`` lines into ``SseEvent`` frames.
+
+    Each incoming line has trailing ``\\r`` characters removed. A line
+    that is then empty ends the current frame and, if any ``data:``
+    payloads are pending, yields them joined with ``\\n``. A
+    whitespace-only line is NOT a frame boundary; it is ignored like any
+    other unrecognised line. ``data:`` lines contribute their payload
+    minus one optional leading space. Comment lines (leading ``:``) and
+    all other fields are dropped.
+
+    Payloads still pending when the iterator is exhausted are emitted as
+    a final event.
+    """
+
+    pending: list[str] = []
+    async for raw in lines:
+        if raw is None:
+            continue
+        line = raw.rstrip("\r")
+        if not line:
+            # Blank separator: emit whatever has accumulated so far.
+            if pending:
+                yield SseEvent(data="\n".join(pending))
+                pending.clear()
+        elif line.startswith("data:"):
+            # spec: optional single space after the colon.
+            value = line[len("data:"):]
+            pending.append(value[1:] if value[:1] == " " else value)
+        # Comments / heartbeats (":...") and any other field (event:,
+        # id:, retry:) are currently unused and fall through here.
+
+    if pending:
+        yield SseEvent(data="\n".join(pending))
+
+
+__all__ = ["SseEvent", "iter_sse_events"]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/init.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/init.py
@ -0,0 +1,35 @@
+"""Direct parser invocations for the parser_compare benchmark.
+
+The SurfSense backend exposes a single ``ETL_SERVICE`` env var that
+picks one parser globally; per-ingestion overrides are not on the
+public API. To drive the four (Azure DI x basic/premium, LlamaCloud x
+basic/premium) extractions we need for ``multimodal_doc/parser_compare``
+we therefore call the Azure DI and LlamaCloud SDKs directly from the
+eval harness, mirroring the production code path in
+``surfsense_backend/app/etl_pipeline/parsers/``.
+
+Two design rules:
+
+* No backend imports — the eval harness cannot pull in the FastAPI
+  app's config layer (it would require the full backend ``.env`` plus a
+  reachable Postgres). We re-read keys from our own environment instead.
+* Same wire shape as the backend's parsers (Azure ``prebuilt-read`` /
+  ``prebuilt-layout`` selected by ``processing_mode``; LlamaCloud
+  ``parse_page_with_llm`` / ``parse_page_with_agent`` selected by
+  ``processing_mode``) so any quality conclusions transfer back to
+  production behaviour.
+"""
+
+from __future__ import annotations
+
+from .azure_di import AzureDIError, parse_with_azure_di
+from .llamacloud import LlamaCloudError, parse_with_llamacloud
+from .pdf_pages import count_pdf_pages
+
+__all__ = [
+    "AzureDIError",
+    "LlamaCloudError",
+    "count_pdf_pages",
+    "parse_with_azure_di",
+    "parse_with_llamacloud",
+]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
@ -0,0 +1,144 @@
+"""Azure Document Intelligence parser — eval-side mirror of the backend.
+
+Calls ``DocumentIntelligenceClient.begin_analyze_document`` with one
+of two ``model_id`` slugs depending on ``processing_mode``:
+
+* ``basic``   → ``prebuilt-read``   (text OCR only, cheaper, faster)
+* ``premium`` → ``prebuilt-layout`` (text + tables + structure;
+                                     produces real markdown headings,
+                                     pipe-tables, etc.)
+
+These are the same model selections the production
+``surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py``
+makes per ``processing_mode``. Output format is forced to Markdown
+(``DocumentContentFormat.MARKDOWN``) so the long-context arm can stuff
+it into a prompt verbatim.
+
+Retry policy is intentionally light here (the eval harness re-runs
+the whole batch on top-level failure); we make up to ``_MAX_RETRIES``
+attempts with jittered exponential backoff between them, retrying only
+transient transport errors and non-4xx HTTP errors.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+logger = logging.getLogger(__name__)
+
+
+_AZURE_MODEL_BY_MODE = {
+    "basic": "prebuilt-read",
+    "premium": "prebuilt-layout",
+}
+
+_MAX_RETRIES = 4
+_BASE_DELAY = 5.0
+_MAX_DELAY = 60.0
+
+
+class AzureDIError(RuntimeError):
+    """Raised when an Azure DI parse cannot produce usable content.
+
+    Covers retries being exhausted, a 4xx response (raised without
+    retrying), and a successful call that returned empty content.
+    """
+
+
+async def parse_with_azure_di(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    endpoint: str | None = None,
+    api_key: str | None = None,
+) -> str:
+    """Run Azure DI on ``file_path`` and return the markdown content.
+
+    Args:
+        file_path: Document to analyse; opened in binary mode on every
+            attempt.
+        processing_mode: Key into ``_AZURE_MODEL_BY_MODE``. Unknown
+            values fall back to ``prebuilt-read``.
+        endpoint: Azure DI endpoint; defaults to the
+            ``AZURE_DI_ENDPOINT`` env var.
+        api_key: Azure DI key; defaults to the ``AZURE_DI_KEY`` env var
+            (both are set in ``surfsense_evals/.env``).
+
+    Returns:
+        The stripped, non-empty ``result.content`` string, requested in
+        Markdown format.
+
+    Raises:
+        ValueError: If the endpoint or key is missing.
+        ClientAuthenticationError: Re-raised as-is, without retrying.
+        AzureDIError: On a 4xx ``HttpResponseError`` (no retry), on
+            empty content (no retry), or after ``_MAX_RETRIES`` failed
+            attempts.
+    """
+
+    endpoint = endpoint or os.environ.get("AZURE_DI_ENDPOINT")
+    api_key = api_key or os.environ.get("AZURE_DI_KEY")
+    if not endpoint or not api_key:
+        raise ValueError(
+            "AZURE_DI_ENDPOINT and AZURE_DI_KEY must be set "
+            "(see surfsense_evals/.env)."
+        )
+
+    model_id = _AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
+
+    # Lazy imports — surfsense_evals shouldn't pay the azure-sdk
+    # import cost on every CLI invocation that doesn't touch
+    # parser_compare.
+    from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
+    from azure.ai.documentintelligence.models import DocumentContentFormat
+    from azure.core.credentials import AzureKeyCredential
+    from azure.core.exceptions import (
+        ClientAuthenticationError,
+        HttpResponseError,
+        ServiceRequestError,
+        ServiceResponseError,
+    )
+
+    # File size is only used for the log line below.
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    logger.info(
+        "Azure DI parsing %s (mode=%s, model=%s, size=%.1fMB)",
+        file_path, processing_mode, model_id, file_size_mb,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            # A fresh client per attempt; ``async with`` closes it even
+            # when the attempt fails.
+            client = DocumentIntelligenceClient(
+                endpoint=endpoint,
+                credential=AzureKeyCredential(api_key),
+            )
+            async with client:
+                with open(file_path, "rb") as fh:
+                    poller = await client.begin_analyze_document(
+                        model_id,
+                        body=fh,
+                        output_content_format=DocumentContentFormat.MARKDOWN,
+                    )
+                result = await poller.result()
+            content = (result.content or "").strip()
+            if not content:
+                # AzureDIError is not named by any ``except`` clause
+                # below, so this propagates immediately (no retry).
+                raise AzureDIError(
+                    f"Azure DI returned empty content for {file_path}"
+                )
+            logger.info(
+                "Azure DI OK: %s (%s) -> %d chars",
+                file_path, model_id, len(content),
+            )
+            return content
+
+        except ClientAuthenticationError:
+            # Bad credentials will not fix themselves; surface unchanged.
+            raise
+        except HttpResponseError as exc:
+            # 4xx that's not auth: don't retry, the request itself is broken.
+            # NOTE(review): 408 and 429 also fall in this range and are
+            # therefore not retried here — confirm that is intended.
+            if exc.status_code and 400 <= exc.status_code < 500:
+                raise AzureDIError(
+                    f"Azure DI {exc.status_code} on {file_path}: {exc}"
+                ) from exc
+            last_exc = exc
+        except (ServiceRequestError, ServiceResponseError) as exc:
+            last_exc = exc
+
+        if attempt < _MAX_RETRIES:
+            # Exponential backoff capped at _MAX_DELAY, with +/-25% jitter.
+            delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+            jitter = delay * 0.25 * (2 * random.random() - 1)
+            sleep_for = delay + jitter
+            logger.warning(
+                "Azure DI attempt %d/%d failed (%s); retrying in %.1fs",
+                attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+            )
+            await asyncio.sleep(sleep_for)
+
+    raise AzureDIError(
+        f"Azure DI failed after {_MAX_RETRIES} attempts on {file_path}"
+    ) from last_exc
+
+
+__all__ = ["AzureDIError", "parse_with_azure_di"]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
@ -0,0 +1,168 @@
+"""LlamaParse (LlamaCloud) parser — eval-side mirror of the backend.
+
+Calls ``LlamaParse.aparse`` with one of two ``parse_mode`` slugs
+depending on ``processing_mode``:
+
+* ``basic``   → ``parse_page_with_llm``   (cheap, single-LLM-call/page)
+* ``premium`` → ``parse_page_with_agent`` (multi-step agent per page;
+                                            handles tables / figures
+                                            substantially better)
+
+These are the exact mappings from production
+``surfsense_backend/app/etl_pipeline/parsers/llamacloud.py``. We keep
+``num_workers=1`` and language=``"en"`` to match production.
+
+The result is materialised via ``get_markdown_documents(split_by_page=False)``
+which concatenates every page into a single markdown string, exactly
+the shape we need for long-context stuffing.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Maps the caller-facing ``processing_mode`` to the LlamaParse
+# ``parse_mode`` slug. ``parse_with_llamacloud`` falls back to
+# ``parse_page_with_llm`` for any mode that is not listed here.
+_LLAMA_PARSE_MODE_MAP = {
+    "basic": "parse_page_with_llm",
+    "premium": "parse_page_with_agent",
+}
+
+# Retry policy: at most ``_MAX_RETRIES`` attempts, with an exponential
+# backoff that starts at ``_BASE_DELAY`` seconds and is capped at
+# ``_MAX_DELAY`` seconds (jitter is applied where the delay is used).
+_MAX_RETRIES = 3
+_BASE_DELAY = 10.0
+_MAX_DELAY = 90.0
+
+
+class LlamaCloudError(RuntimeError):
+    """Raised when a LlamaCloud parse cannot produce usable markdown.
+
+    ``parse_with_llamacloud`` raises it once every retry attempt has
+    failed, and also for an attempt that returns empty content. Because
+    it subclasses ``RuntimeError`` — which that function's retry handler
+    catches — an empty-content failure is retried like any other.
+    """
+
+
+def _extract_markdown(result) -> str:
+    """Pull markdown out of whatever object LlamaParse.aparse returns.
+
+    The SDK has gone through several response shapes; all of them are
+    accepted so a minor SDK bump doesn't silently zero the eval.
+
+    Handled shapes, in order:
+
+    * An object exposing ``get_markdown_documents``: the ``text`` of
+      every returned document, or — when there is none — the ``md`` of
+      every entry in ``result.pages``. If neither yields anything the
+      result is ``""`` (not ``repr(result)``), so the caller's
+      empty-content check fires instead of accepting a repr as content.
+    * A list of document-like objects: each contributes its ``text``,
+      else its ``page_content``, else ``str(doc)``. *All* documents are
+      joined, not just the first one.
+    * Anything else: ``str(result)``.
+
+    Returns:
+        The extracted markdown; pieces are joined with a blank line and
+        empty / ``None`` pieces are skipped.
+    """
+
+    def _doc_text(doc) -> str:
+        """Best-effort text of one document-like object (never ``None``)."""
+        if hasattr(doc, "text"):
+            return doc.text or ""
+        if hasattr(doc, "page_content"):
+            return doc.page_content or ""
+        return str(doc)
+
+    if hasattr(result, "get_markdown_documents"):
+        docs = result.get_markdown_documents(split_by_page=False) or []
+        texts = [doc.text for doc in docs if getattr(doc, "text", None)]
+        if texts:
+            return "\n\n".join(texts)
+        pages = getattr(result, "pages", None) or []
+        # Known result shape: an empty join here means "no content",
+        # which the caller reports as an error and retries.
+        return "\n\n".join(p.md for p in pages if getattr(p, "md", None))
+
+    if isinstance(result, list):
+        parts = [_doc_text(doc) for doc in result]
+        return "\n\n".join(part for part in parts if part)
+
+    return str(result)
+
+
+async def parse_with_llamacloud(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    estimated_pages: int = 50,
+    api_key: str | None = None,
+) -> str:
+    """Run LlamaParse on ``file_path`` and return the markdown content.
+
+    Args:
+        file_path: Document to upload and parse.
+        processing_mode: Key into ``_LLAMA_PARSE_MODE_MAP`` (``"basic"``
+            or ``"premium"``). Any other value silently falls back to
+            ``parse_page_with_llm``.
+        estimated_pages: Page-count estimate; only used to size the job
+            timeout (60s baseline + 30s per page, never below 180s).
+        api_key: LlamaCloud key. Defaults to the ``LLAMA_CLOUD_API_KEY``
+            env var (set in ``surfsense_evals/.env``).
+
+    Returns:
+        The extracted markdown with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If no API key is available.
+        LlamaCloudError: After ``_MAX_RETRIES`` failed attempts. An
+            attempt that yields empty content counts as a failure and is
+            retried, because ``LlamaCloudError`` is a ``RuntimeError``.
+    """
+
+    api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY")
+    if not api_key:
+        raise ValueError(
+            "LLAMA_CLOUD_API_KEY must be set (see surfsense_evals/.env)."
+        )
+
+    parse_mode = _LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
+
+    # Lazy import: llama-cloud pulls llama-index-core (~50 MB) on first
+    # touch; defer until the parser actually runs.
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.base import JobFailedException
+    from llama_cloud_services.parse.utils import ResultType
+
+    # Not wrapped in the retry loop: a missing / unreadable file fails
+    # here immediately instead of being retried.
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    # Match backend's per-page timeout heuristic so big PDFs don't drop
+    # mid-job: 60s baseline + 30s/page (premium agent runs longer than
+    # basic; both fit comfortably here).
+    job_timeout = max(180.0, 60.0 + 30.0 * estimated_pages)
+    # Upload budget scales with file size: 30s per MB, at least 120s.
+    upload_timeout = max(120.0, 30.0 * file_size_mb)
+
+    logger.info(
+        "LlamaCloud parsing %s (mode=%s, parse_mode=%s, %.1fMB, "
+        "job_timeout=%.0fs)",
+        file_path, processing_mode, parse_mode, file_size_mb, job_timeout,
+    )
+
+    # The size-scaled budget applies to both reads and writes; connect
+    # and pool acquisition get a fixed 120s.
+    custom_timeout = httpx.Timeout(
+        connect=120.0, read=upload_timeout, write=upload_timeout, pool=120.0,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            # A fresh HTTP client and parser are built for every attempt.
+            async with httpx.AsyncClient(timeout=custom_timeout) as client:
+                parser = LlamaParse(
+                    api_key=api_key,
+                    num_workers=1,
+                    verbose=False,
+                    language="en",
+                    result_type=ResultType.MD,
+                    parse_mode=parse_mode,
+                    ignore_errors=False,
+                    max_timeout=int(max(2000.0, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=60,
+                    custom_client=client,
+                )
+                result = await parser.aparse(str(file_path))
+            content = _extract_markdown(result).strip()
+            if not content:
+                # Caught by the ``RuntimeError`` handler below, so an
+                # empty result is retried like a transport failure.
+                raise LlamaCloudError(
+                    f"LlamaCloud returned empty content for {file_path}"
+                )
+            logger.info(
+                "LlamaCloud OK: %s (%s) -> %d chars",
+                file_path, parse_mode, len(content),
+            )
+            return content
+
+        except (
+            httpx.HTTPError,
+            JobFailedException,
+            RuntimeError,
+        ) as exc:
+            last_exc = exc
+            if attempt < _MAX_RETRIES:
+                # Exponential backoff (base * 2^(attempt-1), capped) with
+                # +/-25% random jitter; no sleep after the final attempt.
+                delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+                jitter = delay * 0.25 * (2 * random.random() - 1)
+                sleep_for = delay + jitter
+                logger.warning(
+                    "LlamaCloud attempt %d/%d failed (%s); retrying in %.1fs",
+                    attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+                )
+                await asyncio.sleep(sleep_for)
+
+    # All attempts failed; chain the last underlying error for context.
+    raise LlamaCloudError(
+        f"LlamaCloud failed after {_MAX_RETRIES} attempts on {file_path}"
+    ) from last_exc
+
+
+__all__ = ["LlamaCloudError", "parse_with_llamacloud"]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
@ -0,0 +1,35 @@
+"""Tiny pypdf wrapper for "how many pages does this PDF have?".
+
+Used by ``parser_compare`` to:
+
+* Decide LlamaCloud's per-page job timeout.
+* Compute the SurfSense preprocessing dollar cost
+  (``$1 / 1k pages`` for basic, ``$10 / 1k pages`` for premium) so the
+  report can show "ingest + LLM" total cost per arm.
+
+Returns ``0`` (and logs) on parse failure rather than raising — costs
+shown as ``?`` are always better than a benchmark that crashes mid-run.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def count_pdf_pages(path: Path) -> int:
+    """Count the pages of the PDF at ``path`` without ever raising.
+
+    Any failure inside the attempt — pypdf not importable, or pypdf
+    unable to read the file — is logged as a warning and reported as
+    ``0``.
+    """
+
+    try:
+        from pypdf import PdfReader
+
+        page_total = len(PdfReader(str(path)).pages)
+    except Exception as err:  # noqa: BLE001
+        logger.warning("Failed to count pages for %s: %s", path, err)
+        page_total = 0
+    return page_total
+
+
+__all__ = ["count_pdf_pages"]
--- a/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
@ -0,0 +1,31 @@
+"""Domain-agnostic PDF rendering helper. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .render import (
+        PdfImage,
+        render_pdf,
+        render_pdf_with_images,
+        render_text_files_to_pdf,
+    )
+
+__all__ = [
+    "PdfImage",
+    "render_pdf",
+    "render_pdf_with_images",
+    "render_text_files_to_pdf",
+]
+
+
+# Public names that are resolved from ``.render`` on first access.
+_LAZY = {
+    "PdfImage",
+    "render_pdf",
+    "render_pdf_with_images",
+    "render_text_files_to_pdf",
+}
+
+
+def __getattr__(name: str):
+    """Module-level attribute hook: import ``.render`` only on demand.
+
+    Names outside ``_LAZY`` raise ``AttributeError``.
+    """
+    if name not in _LAZY:
+        raise AttributeError(f"module 'surfsense_evals.core.pdf' has no attribute {name!r}")
+
+    from . import render as _render
+
+    return getattr(_render, name)
--- a/surfsense_evals/src/surfsense_evals/core/pdf/render.py
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/render.py
@ -0,0 +1,351 @@
+"""Deterministic ``.txt`` / ``.md`` → single PDF via reportlab.
+
+Used wherever a benchmark needs the same source bytes fed to both the
+native-PDF arm and the SurfSense ingestion arm. The head-to-head
+comparison is fair only if the *same* PDF is the input to both arms,
+which is why we go to lengths to make the rendering deterministic.
+
+Determinism notes:
+
+* We pin the PDF metadata to a fixed creation date and producer
+  (``reportlab`` accepts neither directly, but ``Canvas.setAuthor`` and
+  the absence of an ``info`` mutator means the bytes only differ by
+  ``CreationDate`` / ``ModDate``). We post-process the PDF to scrub
+  those if ``deterministic=True`` is passed.
+* Page size, font, margins, and tab handling are fixed in code so the
+  same input yields the same byte output across machines.
+* PDF/A is overkill for our use; basic PDF 1.4 is what every model
+  expects.
+"""
+
+from __future__ import annotations
+
+import io
+import re
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+from reportlab.lib.pagesizes import LETTER
+from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.platypus import (
+    Image,
+    KeepTogether,
+    PageBreak,
+    Paragraph,
+    SimpleDocTemplate,
+    Spacer,
+)
+
+
+@dataclass
+class RenderedPdf:
+    """Summary of one rendered PDF, as returned by the ``render_*`` helpers."""
+
+    # Location the PDF bytes were written to.
+    path: Path
+    # Heuristic page count (characters // 3000 + number of sections, at
+    # least 1) — an estimate, not read back from the rendered file.
+    n_pages_estimate: int
+    # Total length of the normalised body paragraphs (headings and the
+    # title are not counted).
+    n_chars: int
+
+
+# Group 1 captures the key name so the substitution keeps
+# ``/CreationDate`` and ``/ModDate`` as two distinct entries.
+_PDF_DATE_KEY = re.compile(rb"/(CreationDate|ModDate)\s*\(D:[^)]*\)")
+# reportlab also writes a `/ID [<hex1><hex2>]` trailer entry that
+# embeds a per-run hash. Scrub it so two renders of the same input
+# produce the same bytes.
+_PDF_ID_ARRAY = re.compile(rb"/ID\s*\[\s*<[^>]*>\s*<[^>]*>\s*\]")
+
+
+def _scrub_dates(pdf_bytes: bytes) -> bytes:
+    """Pin ``CreationDate`` / ``ModDate`` / trailer ``/ID`` to fixed values
+    so the file is byte-deterministic across runs.
+
+    Each date keeps its own key (a ``/ModDate`` is no longer rewritten
+    into a second ``/CreationDate``). Every replacement is right-padded
+    with spaces to the length of the text it replaces, so the bytes that
+    follow keep their original offsets in the file.
+
+    Args:
+        pdf_bytes: Raw PDF file content.
+
+    Returns:
+        The scrubbed content; same length as the input whenever the
+        original entries are at least as long as the pinned ones.
+    """
+
+    def _pinned_date(match: re.Match[bytes]) -> bytes:
+        fixed = b"/" + match.group(1) + b" (D:19700101000000Z)"
+        return fixed.ljust(len(match.group(0)))
+
+    def _pinned_id(match: re.Match[bytes]) -> bytes:
+        return b"/ID [<00><00>]".ljust(len(match.group(0)))
+
+    pdf_bytes = _PDF_DATE_KEY.sub(_pinned_date, pdf_bytes)
+    pdf_bytes = _PDF_ID_ARRAY.sub(_pinned_id, pdf_bytes)
+    return pdf_bytes
+
+
+_DEFAULT_STYLES = getSampleStyleSheet()
+
+
+def _build_body_style() -> ParagraphStyle:
+    """Return the ``EvalBody`` paragraph style used for running text.
+
+    Derived from the sample stylesheet's ``BodyText`` with the font and
+    spacing values fixed in code.
+    """
+    return ParagraphStyle(
+        "EvalBody",
+        parent=_DEFAULT_STYLES["BodyText"],
+        fontName="Helvetica",
+        fontSize=10.5,
+        leading=14,
+        spaceAfter=6,
+        spaceBefore=0,
+    )
+
+
+def _build_heading_style() -> ParagraphStyle:
+    """Return the ``EvalHeading`` paragraph style used for section headings.
+
+    Derived from the sample stylesheet's ``Heading2`` with the font and
+    spacing values fixed in code.
+    """
+    return ParagraphStyle(
+        "EvalHeading",
+        parent=_DEFAULT_STYLES["Heading2"],
+        fontName="Helvetica-Bold",
+        fontSize=14,
+        leading=18,
+        spaceAfter=10,
+        spaceBefore=8,
+    )
+
+
+def _normalise_paragraphs(text: str) -> list[str]:
+    """Break ``text`` into paragraphs at blank lines.
+
+    Trailing whitespace is removed from every line (leading whitespace is
+    kept). The lines of one paragraph are joined with single spaces, and
+    runs of blank lines never produce empty paragraphs.
+    """
+
+    paragraphs: list[str] = []
+    pending: list[str] = []
+
+    def _flush() -> None:
+        # Emit the paragraph collected so far, if there is one.
+        if pending:
+            paragraphs.append(" ".join(pending))
+            pending.clear()
+
+    for raw_line in text.splitlines():
+        trimmed = raw_line.rstrip()
+        if trimmed:
+            pending.append(trimmed)
+        else:
+            _flush()
+    _flush()
+    return paragraphs
+
+
+def _escape_html(text: str) -> str:
+    """Escape ``&``, ``<`` and ``>`` so ``text`` is safe inside Paragraph markup."""
+    entities = str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"})
+    return text.translate(entities)
+
+
+def render_pdf(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str]],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Render one PDF from a list of ``(section_heading, section_text)`` tuples.
+
+    ``section_heading`` may be ``None`` for an unnamed section. Every
+    section after the first starts on a new page so the model's PDF
+    parser sees a clean structural boundary between source files.
+
+    Args:
+        title: Document title; rendered at the top of the first page and
+            stored in the PDF metadata.
+        sections: ``(heading, text)`` pairs, rendered in order.
+        output_path: Destination file; missing parent directories are
+            created.
+        deterministic: When true, the bytes are passed through
+            ``_scrub_dates`` before they are written.
+
+    Returns:
+        A ``RenderedPdf`` with the output path, an estimated page count
+        and the number of body characters rendered.
+    """
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Build in memory first so the bytes can be post-processed before
+    # anything touches the disk.
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    # All text is escaped because Paragraph interprets its input as markup.
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text) in enumerate(sections):
+        if index > 0:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            # Counted before escaping, so entities don't inflate the total.
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    # Conservative page estimate: ~3000 chars per LETTER page at 10.5pt.
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+@dataclass
+class PdfImage:
+    """One image to embed inside a section.
+
+    ``caption``, when non-empty, is rendered below the image in a
+    smaller caption style. ``max_width_in`` caps the rendered width in
+    inches; the height is scaled from the image's own pixel dimensions
+    to preserve the aspect ratio.
+    """
+
+    # Image file to embed.
+    path: Path
+    # Optional caption text; empty means "no caption".
+    caption: str = ""
+    max_width_in: float = 5.5  # default leaves margin for LETTER 8.5"
+
+
+def _make_image_flowable(image: PdfImage) -> Image:
+    """Create a reportlab ``Image`` for ``image``, scaled with its aspect ratio kept.
+
+    The width is ``image.max_width_in`` inches unless that would make the
+    image taller than 7 inches; in that case the height is pinned to
+    7 inches and the width is reduced to match.
+
+    Raises:
+        ValueError: If the image reports a non-positive width or height.
+    """
+
+    source = str(image.path)
+    px_w, px_h = ImageReader(source).getSize()
+    if px_w <= 0 or px_h <= 0:
+        raise ValueError(f"Invalid image dimensions for {image.path}: {px_w}x{px_h}")
+
+    width = image.max_width_in * inch
+    height = width * (px_h / px_w)
+    # Cap height too — some medical images are extreme portrait.
+    height_cap = 7.0 * inch
+    if height > height_cap:
+        height = height_cap
+        width = height * (px_w / px_h)
+    return Image(source, width=width, height=height)
+
+
+def render_pdf_with_images(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str, Sequence[PdfImage] | None]],
+    output_path: Path,
+    deterministic: bool = True,
+    page_break_between_sections: bool = False,
+) -> RenderedPdf:
+    """Render a PDF that mixes text and embedded images.
+
+    Each section is ``(heading, body_text, images)``. Images render
+    inline after the body text, each followed by its caption in a
+    smaller caption style (or by a small spacer when it has none).
+    Set ``page_break_between_sections=True`` if you want explicit
+    structural boundaries (mostly useful for multi-case PDFs); the
+    default keeps everything on one page when possible (so a single
+    MedXpertQA case is one PDF page with case + images + options).
+
+    An image that cannot be turned into a flowable is skipped so one bad
+    file does not abort the whole render; each skip is logged as a
+    warning, because a missing image changes what the model sees.
+
+    Args:
+        title: Document title; rendered first and stored in the metadata.
+        sections: ``(heading, text, images)`` triples, rendered in order;
+            ``heading`` and ``images`` may be ``None``.
+        output_path: Destination file; missing parent directories are
+            created.
+        deterministic: When true, the bytes are passed through
+            ``_scrub_dates`` before they are written.
+        page_break_between_sections: Start every section after the first
+            on a new page.
+
+    Returns:
+        A ``RenderedPdf`` with the output path, an estimated page count
+        and the number of body characters rendered (images not counted).
+    """
+
+    # Local import: this module has no module-level logger.
+    import logging
+
+    log = logging.getLogger(__name__)
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    # NOTE(review): confirm reportlab resolves the 3-digit "#444" to the
+    # intended grey; a 6-digit form may be needed.
+    caption_style = ParagraphStyle(
+        "EvalCaption",
+        parent=body_style,
+        fontSize=9,
+        leading=11,
+        textColor="#444",
+        spaceBefore=2,
+        spaceAfter=10,
+    )
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text, images) in enumerate(sections):
+        if index > 0 and page_break_between_sections:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+        for image in images or []:
+            try:
+                img_flow = _make_image_flowable(image)
+            except Exception as exc:  # noqa: BLE001 — bad image shouldn't kill PDF
+                # ``getattr`` keeps the log call itself from raising when
+                # the entry is not a ``PdfImage`` at all.
+                log.warning(
+                    "Skipping image %s while rendering %s: %s",
+                    getattr(image, "path", image), output_path, exc,
+                )
+                continue
+            # Keep the image and its caption (or spacer) on the same page.
+            grouped = [img_flow]
+            if image.caption:
+                grouped.append(Paragraph(_escape_html(image.caption), caption_style))
+            else:
+                grouped.append(Spacer(1, 8))
+            flow.append(KeepTogether(grouped))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    # Same heuristic as ``render_pdf``: ~3000 chars per page plus one
+    # page per section; images are not accounted for.
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+def render_text_files_to_pdf(
+    *,
+    title: str,
+    files: Iterable[Path],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Read text files and render them as the sections of a single PDF.
+
+    Every file becomes one section whose heading is the file name
+    without its extension (``admission_note.txt`` → ``admission_note``)
+    and whose body is the file's UTF-8 content. Handy for text-only
+    benchmarks that ship a corpus as separate ``.txt`` / ``.md`` shards
+    per logical document. Rendering is delegated to ``render_pdf``.
+    """
+
+    sections: list[tuple[str | None, str]] = [
+        (source.stem, source.read_text(encoding="utf-8"))
+        for source in map(Path, files)
+    ]
+    return render_pdf(
+        title=title,
+        sections=sections,
+        output_path=output_path,
+        deterministic=deterministic,
+    )
+
+
+# Tiny self-check — handy when debugging.
+def _self_test() -> None:  # pragma: no cover
+    """Render a two-section sample PDF into the working directory and report it."""
+    rendered = render_pdf(
+        title="Self test",
+        sections=[
+            ("intro", "Hello world.\n\nThis is a test."),
+            ("body", "Line one.\nLine two."),
+        ],
+        output_path=Path("./_render_self_test.pdf"),
+    )
+    print(f"wrote {rendered.path} ({rendered.n_chars} chars)")
+
+
+# Importing ``datetime`` keeps the timezone helper handy if a future
+# benchmark wants to embed a real timestamp without losing determinism.
+_NOW_FROZEN = datetime(2026, 5, 11, tzinfo=UTC)
--- a/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
+++ b/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
@ -0,0 +1,22 @@
+"""External LLM providers (used by the native arm).
+
+Lazy imports so the SurfSense-only path doesn't transitively load the
+OpenRouter client until something actually constructs ``OpenRouterPdfProvider``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .openrouter_pdf import OpenRouterPdfProvider, OpenRouterResponse
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse"]
+
+
+def __getattr__(name: str):
+    """Module-level attribute hook: import ``.openrouter_pdf`` only on demand.
+
+    Only the two public provider names are resolved; anything else
+    raises ``AttributeError``.
+    """
+    if name not in {"OpenRouterPdfProvider", "OpenRouterResponse"}:
+        raise AttributeError(f"module 'surfsense_evals.core.providers' has no attribute {name!r}")
+
+    from . import openrouter_pdf as _provider_mod
+
+    return getattr(_provider_mod, name)
--- a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
@ -0,0 +1,118 @@
+"""Bare OpenRouter ``chat/completions`` provider — no PDF, no plugins.
+
+Used by ``BareLlmArm`` to measure "what does the model answer with
+zero retrieval context?". Same wire shape as ``OpenRouterPdfProvider``
+minus the file-parser plugin and the ``file`` content part:
+
+```json
+{
+  "model": "openai/gpt-5.4-mini",
+  "messages": [
+    {"role": "system", "content": "<optional>"},
+    {"role": "user",   "content": "<prompt>"}
+  ]
+}
+```
+
+The response shape is identical to the PDF provider's, so we re-use
+``_parse_chat_completion`` from ``openrouter_pdf`` and only specialise
+the request builder. That keeps cost-extraction, token-counting, and
+content-array handling in one place.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any
+
+import httpx
+
+from .openrouter_pdf import (
+    OpenRouterResponse,
+    _DEFAULT_HEADERS,
+    _parse_chat_completion,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class OpenRouterChatProvider:
+    """Bare ``chat/completions`` client: no PDF part, no file-parser plugin.
+
+    Instances only hold configuration (key, base URL, model, timeout).
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        timeout_s: float = 600.0,
+    ) -> None:
+        """Store the connection settings.
+
+        Raises:
+            ValueError: If ``api_key`` is empty.
+        """
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the bare-LLM arm.")
+        self._api_key = api_key
+        self._model = model
+        # Trailing slashes are dropped so the endpoint can be appended.
+        self._base = base_url.rstrip("/")
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        """Model slug sent with every request."""
+        return self._model
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None,
+        max_tokens: int | None,
+    ) -> dict[str, Any]:
+        """Assemble the JSON request body.
+
+        The system message is included only for a non-empty
+        ``system_prompt``, and ``max_tokens`` only when it is truthy.
+        """
+        system_part: list[dict[str, Any]] = (
+            [{"role": "system", "content": system_prompt}] if system_prompt else []
+        )
+        request: dict[str, Any] = {
+            "model": self._model,
+            "messages": [*system_part, {"role": "user", "content": prompt}],
+        }
+        if max_tokens:
+            request["max_tokens"] = max_tokens
+        return request
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None = None,
+        max_tokens: int | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Run one chat completion and return the parsed response.
+
+        Uses ``http`` when given, otherwise a short-lived client of its
+        own. Nothing is retried here: transport errors propagate and any
+        status >= 400 is raised as ``httpx.HTTPStatusError`` — the
+        caller decides about retries.
+        """
+
+        request_body = self._build_payload(
+            prompt=prompt,
+            system_prompt=system_prompt,
+            max_tokens=max_tokens,
+        )
+        request_headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        endpoint = f"{self._base}/chat/completions"
+
+        t0 = time.monotonic()
+        if http is None:
+            async with httpx.AsyncClient(timeout=self._timeout) as own_client:
+                response = await own_client.post(
+                    endpoint,
+                    json=request_body,
+                    headers=request_headers,
+                    timeout=self._timeout,
+                )
+        else:
+            response = await http.post(
+                endpoint,
+                json=request_body,
+                headers=request_headers,
+                timeout=self._timeout,
+            )
+        elapsed_ms = int((time.monotonic() - t0) * 1000)
+
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        return _parse_chat_completion(response.json(), latency_ms=elapsed_ms)
+
+
+__all__ = ["OpenRouterChatProvider"]
--- a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
@ -0,0 +1,231 @@
+"""Native-PDF arm provider: OpenRouter ``chat/completions`` with PDF input.
+
+Per `<https://openrouter.ai/docs/features/multimodal/pdfs>`__ the wire
+shape is OpenAI-compatible with one PDF-specific extra:
+
+```json
+{
+  "model": "anthropic/claude-sonnet-4.5",
+  "messages": [{
+    "role": "user",
+    "content": [
+      {"type": "file", "file": {"filename": "case.pdf",
+        "file_data": "data:application/pdf;base64,<b64>"}},
+      {"type": "text", "text": "<prompt>"}
+    ]
+  }],
+  "plugins": [{"id": "file-parser", "pdf": {"engine": "native"}}]
+}
+```
+
+``engine: "native"`` is the only engine that doesn't pre-OCR the
+PDF — it forwards raw bytes to PDF-native models (Claude, Gemini),
+matching what a human user does when "dropping the PDF into Claude".
+``mistral-ocr`` and ``cloudflare-ai`` are exposed as enum options for
+non-native models.
+
+Headers ``HTTP-Referer`` and ``X-Title`` make spend show up cleanly on
+the OpenRouter dashboard.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import time
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+class PdfEngine(str, Enum):
+    """Value sent as ``pdf.engine`` of OpenRouter's ``file-parser`` plugin.
+
+    Members are also ``str`` instances; the provider sends ``.value`` in
+    the request body.
+    """
+
+    # Forwards the raw PDF bytes to PDF-native models (no pre-OCR).
+    NATIVE = "native"
+    # Engines exposed for models without native PDF input.
+    MISTRAL_OCR = "mistral-ocr"
+    CLOUDFLARE_AI = "cloudflare-ai"
+
+
+@dataclass
+class OpenRouterResponse:
+    """Subset of the OpenRouter response we care about for scoring."""
+
+    # Assistant text of the first choice ("" when none was returned).
+    text: str
+    # Token counts from the response's ``usage`` object (0 when absent).
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    # Reported dollar cost converted to integer millionths of a dollar.
+    cost_micros: int
+    # Wall-clock duration of the HTTP call, in milliseconds.
+    latency_ms: int
+    # ``finish_reason`` of the first choice; ``None`` when missing/empty.
+    finish_reason: str | None
+    # The complete decoded JSON payload, kept for debugging.
+    raw: dict[str, Any]
+
+
+# Attribution headers merged into every request so spend shows up
+# cleanly on the OpenRouter dashboard.
+_DEFAULT_HEADERS = {
+    "HTTP-Referer": "https://github.com/MODSetter/SurfSense",
+    "X-Title": "SurfSense-evals",
+}
+
+
+class OpenRouterPdfProvider:
+    """Thin httpx-based client for PDF-in-prompt chat completions.
+
+    Instances only hold configuration (key, base URL, model, engine,
+    timeout), so one instance can be reused per arm.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        timeout_s: float = 600.0,
+    ) -> None:
+        """Store the connection settings.
+
+        Raises:
+            ValueError: If ``api_key`` is empty.
+        """
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the native arm.")
+        self._api_key = api_key
+        self._model = model
+        self._engine = engine
+        # Trailing slashes are dropped so the endpoint can be appended.
+        self._base = base_url.rstrip("/")
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        """Model slug sent with every request."""
+        return self._model
+
+    @property
+    def engine(self) -> PdfEngine:
+        """PDF engine requested from the ``file-parser`` plugin."""
+        return self._engine
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None,
+        extra_messages: list[dict[str, Any]] | None,
+    ) -> dict[str, Any]:
+        """Assemble the JSON request body.
+
+        The PDF is read from disk and embedded as a base64 data URL in a
+        ``file`` content part placed before the text prompt.
+        ``extra_messages`` (if any) precede the user message, and
+        ``max_tokens`` is included only when it is truthy.
+        """
+        encoded_pdf = base64.b64encode(pdf_path.read_bytes()).decode("ascii")
+        file_part: dict[str, Any] = {
+            "type": "file",
+            "file": {
+                "filename": pdf_path.name,
+                "file_data": f"data:application/pdf;base64,{encoded_pdf}",
+            },
+        }
+        prompt_part: dict[str, Any] = {"type": "text", "text": prompt}
+        user_message: dict[str, Any] = {
+            "role": "user",
+            "content": [file_part, prompt_part],
+        }
+        request: dict[str, Any] = {
+            "model": self._model,
+            "messages": [*(extra_messages or []), user_message],
+            "plugins": [
+                {"id": "file-parser", "pdf": {"engine": self._engine.value}}
+            ],
+        }
+        if max_tokens:
+            request["max_tokens"] = max_tokens
+        return request
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None = None,
+        extra_messages: list[dict[str, Any]] | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Run one chat completion over ``pdf_path`` and return the parsed response.
+
+        Uses ``http`` when given, otherwise a short-lived client of its
+        own. Nothing is retried here: transport errors propagate and any
+        status >= 400 is raised as ``httpx.HTTPStatusError`` — the
+        runner decides about retries.
+        """
+
+        request_body = self._build_payload(
+            prompt=prompt,
+            pdf_path=pdf_path,
+            max_tokens=max_tokens,
+            extra_messages=extra_messages,
+        )
+        request_headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        endpoint = f"{self._base}/chat/completions"
+
+        t0 = time.monotonic()
+        if http is None:
+            async with httpx.AsyncClient(timeout=self._timeout) as own_client:
+                response = await own_client.post(
+                    endpoint,
+                    json=request_body,
+                    headers=request_headers,
+                    timeout=self._timeout,
+                )
+        else:
+            response = await http.post(
+                endpoint,
+                json=request_body,
+                headers=request_headers,
+                timeout=self._timeout,
+            )
+        elapsed_ms = int((time.monotonic() - t0) * 1000)
+
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        return _parse_chat_completion(response.json(), latency_ms=elapsed_ms)
+
+
+def _parse_chat_completion(payload: dict[str, Any], *, latency_ms: int) -> OpenRouterResponse:
+    """Tolerant parser for OpenRouter / OpenAI chat-completions JSON.
+
+    OpenRouter passes through any provider-specific extras, but the
+    canonical shape is ``choices[0].message.content`` (string OR array
+    of content parts) and ``usage.prompt_tokens / completion_tokens / total_tokens``.
+    Cost is read from ``payload["usage"]["cost"]`` and, when that is
+    absent, from the top-level ``payload["cost"]``.
+
+    Args:
+        payload: Decoded JSON body of the response.
+        latency_ms: Measured request duration, copied into the result.
+
+    Returns:
+        An ``OpenRouterResponse``. Missing pieces degrade to ``""`` /
+        ``0`` / ``None`` rather than raising; a cost that cannot be
+        converted to a finite number is reported as ``0``.
+    """
+
+    text = ""
+    finish_reason: str | None = None
+    choices = payload.get("choices") or []
+    if choices:
+        first_choice = choices[0] or {}
+        message = first_choice.get("message") or {}
+        content = message.get("content")
+        if isinstance(content, str):
+            text = content
+        elif isinstance(content, list):
+            chunks: list[str] = []
+            for part in content:
+                if isinstance(part, dict) and part.get("type") in {"text", "output_text"}:
+                    part_text = part.get("text")
+                    # An explicit ``"text": null`` must contribute
+                    # nothing, not the literal string "None".
+                    if part_text is not None:
+                        chunks.append(str(part_text))
+            text = "".join(chunks)
+        finish_reason = first_choice.get("finish_reason") or None
+
+    usage = payload.get("usage") or {}
+    input_tokens = int(usage.get("prompt_tokens") or 0)
+    output_tokens = int(usage.get("completion_tokens") or 0)
+    total_tokens = int(usage.get("total_tokens") or (input_tokens + output_tokens))
+
+    # OpenRouter exposes cost in dollars on `usage.cost` or `cost`. We
+    # convert to integer micros to avoid float-summing surprises across
+    # 7,663 MIRAGE questions.
+    raw_cost = usage.get("cost")
+    if raw_cost is None:
+        raw_cost = payload.get("cost")
+    cost_micros = 0
+    if raw_cost is not None:
+        try:
+            cost_micros = int(round(float(raw_cost) * 1_000_000))
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: an infinite cost cannot become an int.
+            cost_micros = 0
+
+    return OpenRouterResponse(
+        text=text,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        total_tokens=total_tokens,
+        cost_micros=cost_micros,
+        latency_ms=latency_ms,
+        finish_reason=finish_reason,
+        raw=payload,
+    )
+
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse", "PdfEngine"]
--- a/surfsense_evals/src/surfsense_evals/core/registry.py
+++ b/surfsense_evals/src/surfsense_evals/core/registry.py
@ -0,0 +1,265 @@
+"""Suite + Benchmark protocols and the global registry.
+
+The extensibility seam: ``core.cli`` walks ``surfsense_evals.suites`` on
+import, which auto-imports every benchmark subpackage, which calls
+``register(<benchmark>)`` at module bottom. The CLI then iterates the
+populated registry to build subcommand groups dynamically.
+
+Adding a new domain = drop a folder under ``suites/<domain>/<bench>/``
+that ends in ``register(MyBenchmark())``. No edits anywhere in
+``core/`` are required.
+"""
+
+from __future__ import annotations
+
+import argparse
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+import httpx
+
+from .clients import DocumentsClient, NewChatClient, SearchSpaceClient
+from .config import Config, SuiteState
+
+# ---------------------------------------------------------------------------
+# Run context — what every benchmark.ingest/run receives
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunContext:
+    """Per-invocation environment threaded into ``ingest`` and ``run``.
+
+    A benchmark uses this to read pinned suite state, build new HTTP
+    clients on the shared ``http`` session, find the right data /
+    reports paths, and discover the active OpenRouter model + key.
+
+    ``http`` is the authenticated SurfSense client (auth event hook
+    attached). It is **not** an OpenRouter client — providers create
+    their own short-lived clients because OpenRouter doesn't share the
+    SurfSense bearer.
+    """
+
+    # Suite name; selects the per-suite maps / runs / data directories.
+    suite: str
+    # Benchmark name; used as the sub-directory under the suite's runs
+    # and data directories.
+    benchmark: str
+    # Supplies ``surfsense_api_base`` and the per-suite directory roots.
+    config: Config
+    # Source of the ids, model slugs and scenario exposed by the
+    # properties below.
+    suite_state: SuiteState
+    # Shared HTTP session handed to every client built by this context.
+    http: httpx.AsyncClient
+
+    @property
+    def search_space_id(self) -> int:
+        """Search space id recorded in the suite state."""
+        return self.suite_state.search_space_id
+
+    @property
+    def agent_llm_id(self) -> int:
+        """Agent LLM id recorded in the suite state."""
+        return self.suite_state.agent_llm_id
+
+    @property
+    def provider_model(self) -> str:
+        """Slug used by the SurfSense agent (and the native arm by default).
+
+        For ``cost-arbitrage`` scenarios this is the *cheap, text-only*
+        slug — SurfSense answers from the chunks the vision LLM already
+        extracted at ingest. The native arm should use
+        ``native_arm_model`` instead in that scenario.
+        """
+
+        return self.suite_state.provider_model
+
+    @property
+    def native_arm_model(self) -> str:
+        """Slug the native_pdf arm should use.
+
+        Defaults to ``provider_model`` (head-to-head / symmetric-cheap);
+        for ``cost-arbitrage`` it returns the explicit
+        ``--native-arm-model`` so the native arm can fairly answer
+        image-bearing questions.
+        """
+
+        return self.suite_state.effective_native_arm_model
+
+    @property
+    def vision_provider_model(self) -> str | None:
+        """Slug of the OpenRouter vision LLM SurfSense used at ingest.
+
+        ``None`` if no vision config was attached at setup (legacy or
+        text-only suite). Used by runners purely to record what was
+        actually used in ``RunArtifact.extra`` and to label reports.
+        """
+
+        return self.suite_state.vision_provider_model
+
+    @property
+    def scenario(self) -> str:
+        """Scenario name pinned at setup time (see ``config.SCENARIOS``)."""
+
+        return self.suite_state.scenario
+
+    def search_space_client(self) -> SearchSpaceClient:
+        """Build a ``SearchSpaceClient`` on the shared ``http`` session."""
+        return SearchSpaceClient(self.http, self.config.surfsense_api_base)
+
+    def documents_client(self) -> DocumentsClient:
+        """Build a ``DocumentsClient`` on the shared ``http`` session."""
+        return DocumentsClient(self.http, self.config.surfsense_api_base)
+
+    def new_chat_client(self) -> NewChatClient:
+        """Build a ``NewChatClient`` on the shared ``http`` session."""
+        return NewChatClient(self.http, self.config.surfsense_api_base)
+
+    def maps_dir(self) -> Path:
+        """Return the suite's maps directory, creating it if missing."""
+        path = self.config.suite_maps_dir(self.suite)
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def runs_dir(self, *, run_timestamp: str) -> Path:
+        """Return ``<suite runs dir>/<run_timestamp>/<benchmark>``, creating it if missing."""
+        path = self.config.suite_runs_dir(self.suite) / run_timestamp / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def benchmark_data_dir(self) -> Path:
+        """Return ``<suite data dir>/<benchmark>``, creating it if missing."""
+        path = self.config.suite_data_dir(self.suite) / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+
+# ---------------------------------------------------------------------------
+# Run artifact + report section
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunArtifact:
+    """Everything a runner persists for the report writer to consume.
+
+    ``raw_path`` points at the JSONL of per-question ``ArmResult``
+    rows. ``metrics`` is a free-form dict the benchmark fills in (e.g.
+    ``{"native": {...}, "surfsense": {...}, "delta": {...}}``).
+    """
+
+    # Suite / benchmark names this artifact belongs to.
+    suite: str
+    benchmark: str
+    # NOTE(review): presumably the same timestamp string passed to
+    # ``RunContext.runs_dir`` — confirm against the runners.
+    run_timestamp: str
+    raw_path: Path
+    metrics: dict[str, Any] = field(default_factory=dict)
+    # Free-form run metadata; ``format_scenario_md`` reads the
+    # "scenario", "provider_model", "native_arm_model" and
+    # "vision_provider_model" keys from a mapping of this shape.
+    extra: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ReportSection:
+    """One benchmark's slice of the final summary."""
+
+    # Rendered as a ``### <title>`` heading; also the case-insensitive
+    # secondary sort key in ``write_report``.
+    title: str
+    # True -> listed under "## Headline"; False -> under
+    # "## Secondary measurements".
+    headline: bool
+    # Markdown body placed under the heading in ``summary.md`` (trailing
+    # whitespace is stripped there; ``summary.json`` keeps it verbatim).
+    body_md: str
+    # Structured counterpart emitted as-is into ``summary.json``.
+    body_json: dict[str, Any] = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# Benchmark protocol + registry
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Benchmark(Protocol):
+    """Structural contract for the object a benchmark module passes to ``register``.
+
+    ``(suite, name)`` is the key the registry stores the benchmark under.
+    Because of ``@runtime_checkable``, ``isinstance(obj, Benchmark)`` works,
+    but it only checks that the members exist, not their signatures.
+    """
+
+    # First component of the registry key.
+    suite: str
+    # Second component of the registry key.
+    name: str
+    # NOTE(review): presumably marks a headline benchmark (cf.
+    # ``ReportSection.headline``) — confirm against the CLI / report code.
+    headline: bool
+    # NOTE(review): presumably human-readable help text — confirm.
+    description: str
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:  # pragma: no cover - protocol
+        """Ingestion step; receives the per-invocation ``RunContext``."""
+        ...
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover - protocol
+        """Execute the benchmark and return the ``RunArtifact`` describing the run."""
+        ...
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:  # pragma: no cover - protocol
+        """Add benchmark-specific flags to ``run <suite> <benchmark>``."""
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:  # pragma: no cover - protocol
+        """Turn this benchmark's run artifacts into one ``ReportSection``."""
+        ...
+
+
+# ---------------------------------------------------------------------------
+# Registry storage
+# ---------------------------------------------------------------------------
+
+
+# Module-level store keyed by ``(suite, name)``; filled by ``register``.
+_REGISTRY: dict[tuple[str, str], Benchmark] = {}
+
+
+def register(benchmark: Benchmark) -> None:
+    """Store ``benchmark`` in the registry under ``(suite, name)``.
+
+    Registering an existing key replaces the earlier entry and logs a
+    warning instead of raising, so a benchmark module that is imported
+    twice (auto-discovery plus a direct import from a test) does not
+    break the CLI.
+    """
+
+    slot = (benchmark.suite, benchmark.name)
+    suite_name, bench_name = slot
+    if slot in _REGISTRY:
+        import logging
+
+        log = logging.getLogger(__name__)
+        log.warning(
+            "Benchmark %s/%s re-registered (overwriting prior)",
+            suite_name,
+            bench_name,
+        )
+    _REGISTRY[slot] = benchmark
+
+
+def unregister(suite: str, name: str) -> None:
+    """Test helper: remove one benchmark; an unknown key is ignored."""
+
+    slot = (suite, name)
+    if slot in _REGISTRY:
+        del _REGISTRY[slot]
+
+
+def reset() -> None:
+    """Test helper: empty the registry in place (use with monkeypatched discovery)."""
+
+    _REGISTRY.clear()
+
+
+def get(suite: str, name: str) -> Benchmark:
+    """Look up the benchmark registered as ``suite``/``name``.
+
+    Raises ``KeyError`` whose message lists every registered
+    ``suite/name`` pair (or ``<none>``) when the key is unknown.
+    """
+
+    slot = (suite, name)
+    try:
+        return _REGISTRY[slot]
+    except KeyError as exc:
+        known = [f"{s}/{n}" for s, n in sorted(_REGISTRY)]
+        available = ", ".join(known) if known else "<none>"
+        message = f"Unknown benchmark '{suite}/{name}'. Registered: {available}"
+        raise KeyError(message) from exc
+
+
+def list_suites() -> list[str]:
+    """Return the distinct suite names present in the registry, sorted."""
+
+    suite_names = set()
+    for suite_name, _bench_name in _REGISTRY:
+        suite_names.add(suite_name)
+    return sorted(suite_names)
+
+
+def list_benchmarks(suite: str | None = None) -> list[Benchmark]:
+    """Return registered benchmarks ordered by ``(suite, name)``.
+
+    When ``suite`` is given, only that suite's benchmarks are returned.
+    """
+
+    ordered_keys = sorted(_REGISTRY)
+    if suite is not None:
+        ordered_keys = [key for key in ordered_keys if key[0] == suite]
+    return [_REGISTRY[key] for key in ordered_keys]
+
+
+def snapshot() -> Mapping[tuple[str, str], Benchmark]:
+    """Shallow copy of the registry for diagnostics (e.g. ``benchmarks list`` rendering).
+
+    The result is a new ``dict``; changing it does not touch the registry.
+    """
+
+    return _REGISTRY.copy()
+
+
+__all__ = [
+    "Arm",
+    "Benchmark",
+    "ReportSection",
+    "RunArtifact",
+    "RunContext",
+    "get",
+    "list_benchmarks",
+    "list_suites",
+    "register",
+    "reset",
+    "snapshot",
+    "unregister",
+]
+
+
+# Re-export Arm from arms.base so suites can `from core.registry import Arm`.
+from .arms.base import Arm  # noqa: E402, F401  (deliberate re-export at bottom)
--- a/surfsense_evals/src/surfsense_evals/core/report/__init__.py
+++ b/surfsense_evals/src/surfsense_evals/core/report/__init__.py
@ -0,0 +1,18 @@
+"""Report writer + section composition primitives. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .writer import write_report
+
+__all__ = ["write_report"]
+
+
+def __getattr__(name: str):
+    """Module-level attribute hook that imports ``.writer`` only when ``write_report`` is requested."""
+    if name != "write_report":
+        raise AttributeError(f"module 'surfsense_evals.core.report' has no attribute {name!r}")
+
+    from .writer import write_report
+
+    return write_report
--- a/surfsense_evals/src/surfsense_evals/core/report/writer.py
+++ b/surfsense_evals/src/surfsense_evals/core/report/writer.py
@ -0,0 +1,89 @@
+"""Report writer — composes per-benchmark sections into one summary.
+
+Output:
+
+* ``reports/<suite>/<run-timestamp>/summary.md`` — human-readable.
+  Bullet lists only (no tables) per project's coding-standards.
+* ``reports/<suite>/<run-timestamp>/summary.json`` — same content as
+  structured JSON for downstream tooling (CI dashboards, regressions).
+
+Headline benchmarks come first in both outputs.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable
+from pathlib import Path
+
+from ..config import Config
+from ..registry import ReportSection
+
+
+def write_report(
+    *,
+    config: Config,
+    suite: str,
+    sections: Iterable[ReportSection],
+    run_timestamp: str,
+) -> Path:
+    """Write ``summary.md`` and ``summary.json`` for one suite run.
+
+    Sections are ordered headline-first, then by case-insensitive title.
+    Both files land in ``config.suite_reports_dir(suite) / run_timestamp``
+    (created if missing). Returns the path of the markdown file.
+    """
+
+    ordered = sorted(sections, key=lambda sec: (not sec.headline, sec.title.lower()))
+
+    report_dir = config.suite_reports_dir(suite) / run_timestamp
+    report_dir.mkdir(parents=True, exist_ok=True)
+
+    lines: list[str] = [
+        f"# SurfSense evals — suite `{suite}`",
+        "",
+        f"- Run timestamp: `{run_timestamp}`",
+        f"- Sections: {len(ordered)}",
+        "",
+    ]
+
+    # One markdown group per headline flag; an empty group emits nothing.
+    groups = (
+        ("## Headline", [sec for sec in ordered if sec.headline]),
+        ("## Secondary measurements", [sec for sec in ordered if not sec.headline]),
+    )
+    for heading, members in groups:
+        if not members:
+            continue
+        lines.extend((heading, ""))
+        for sec in members:
+            lines.extend((f"### {sec.title}", "", sec.body_md.rstrip(), ""))
+
+    summary_md = report_dir / "summary.md"
+    summary_md.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+
+    # The JSON mirror keeps ``body_md`` verbatim (no stripping).
+    document = {
+        "suite": suite,
+        "run_timestamp": run_timestamp,
+        "sections": [
+            {
+                "title": sec.title,
+                "headline": sec.headline,
+                "body_md": sec.body_md,
+                "body_json": sec.body_json,
+            }
+            for sec in ordered
+        ],
+    }
+    summary_json = report_dir / "summary.json"
+    summary_json.write_text(
+        json.dumps(document, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+    return summary_md
+
+
+__all__ = ["ReportSection", "write_report"]
--- a/surfsense_evals/src/surfsense_evals/core/scenarios.py
+++ b/surfsense_evals/src/surfsense_evals/core/scenarios.py
@ -0,0 +1,58 @@
+"""Shared scenario formatting helpers for head-to-head benchmark reports.
+
+The scenario chosen at ``setup`` time (``head-to-head``, ``symmetric-cheap``,
+``cost-arbitrage``) materially changes how a head-to-head report should be
+read. This module produces the one-bullet summary every head-to-head
+runner stamps near the top of its ``report_section`` body so reviewers
+immediately see the framing — no need to dig into ``run_artifact.json``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+
+def format_scenario_md(extra: Mapping[str, Any] | None) -> str:
+    """Build the one-bullet scenario summary for a benchmark report.
+
+    Looks at ``extra["scenario"]`` and the model slugs stored next to it
+    (``provider_model``, ``native_arm_model``, ``vision_provider_model``).
+    A missing or empty ``extra`` / scenario renders the plain
+    "head-to-head" line, so artifacts without scenario data still format.
+    """
+
+    info = dict(extra) if extra else {}
+    scenario = str(info.get("scenario") or "head-to-head")
+    surf_model = str(info.get("provider_model") or "?")
+    # The native arm falls back to the SurfSense slug when none was recorded.
+    native_model = str(info.get("native_arm_model") or surf_model)
+    ingest_vlm = info.get("vision_provider_model")
+
+    if scenario == "cost-arbitrage":
+        extracted_by = f" by `{ingest_vlm}`" if ingest_vlm else ""
+        return (
+            f"- Scenario: **cost-arbitrage** — native arm answers with "
+            f"`{native_model}` (vision); SurfSense answers with `{surf_model}` "
+            f"over chunks vision-extracted at ingest{extracted_by}. "
+            "Measures how close SurfSense gets to native at a fraction of "
+            "the per-query cost."
+        )
+
+    if scenario == "symmetric-cheap":
+        extracted_via = f" via `{ingest_vlm}`" if ingest_vlm else ""
+        return (
+            f"- Scenario: **symmetric-cheap** — both arms answer with "
+            f"`{surf_model}`; SurfSense pre-extracted images at ingest{extracted_via}. "
+            "Native arm structurally loses on image-bearing questions "
+            "(text-only model can't see images) — that's the point."
+        )
+
+    # Any other scenario value renders as plain head-to-head.
+    line = (
+        f"- Scenario: head-to-head — both arms answer with `{surf_model}` "
+        "via OpenRouter."
+    )
+    if ingest_vlm:
+        line = f"{line} SurfSense ingest VLM: `{ingest_vlm}`."
+    return line
+
+
+__all__ = ["format_scenario_md"]
--- a/surfsense_evals/src/surfsense_evals/core/vision_llm.py
+++ b/surfsense_evals/src/surfsense_evals/core/vision_llm.py
@ -0,0 +1,127 @@
+"""Vision LLM resolution + auto-pick logic for the harness's ``setup`` command.
+
+Two responsibilities:
+
+1. Resolve an explicit ``--vision-llm <slug>`` to a global OpenRouter
+   vision LLM config id that ``set_llm_preferences(vision_llm_config_id=...)``
+   can accept.
+2. Auto-pick the strongest registered vision config when the operator
+   doesn't pass ``--vision-llm`` but the scenario / benchmark needs one.
+
+The priority list mirrors the recommended slugs in the README so the
+auto-pick is deterministic and reviewable.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from .clients.search_space import VisionLlmConfigEntry
+
+# Order matters — first match wins when auto-picking. Keep these in sync
+# with the "Recommended vision slugs" table in the README so the
+# auto-pick story is the same one users read about.
+RECOMMENDED_VISION_PRIORITY: tuple[str, ...] = (
+    "anthropic/claude-sonnet-4.5",
+    "anthropic/claude-opus-4.7",
+    "openai/gpt-5",
+    "google/gemini-2.5-pro",
+)
+
+
+class VisionConfigError(RuntimeError):
+    """Raised when no vision config can be resolved (explicit or auto).
+
+    ``resolve_vision_llm`` raises it when an explicit slug matches zero or
+    several OpenRouter vision configs, or when auto-picking finds no
+    OpenRouter vision config at all.
+    """
+
+
+@dataclass(frozen=True)
+class ResolvedVisionConfig:
+    """Result of ``resolve_vision_llm`` — what to attach + a label for logs."""
+
+    # ``id`` of the chosen vision config entry.
+    config_id: int
+    # ``model_name`` (the slug) of the chosen entry.
+    provider_model: str
+    # How the entry was chosen; ``resolve_vision_llm`` sets one of the
+    # three labels listed on the right.
+    selected_via: str  # "explicit" | "auto-priority" | "auto-fallback"
+
+
+def _openrouter_only(entries: Iterable[VisionLlmConfigEntry]) -> list[VisionLlmConfigEntry]:
+    """Keep, in input order, the entries whose provider is ``"OPENROUTER"`` and that are not auto-mode."""
+    kept: list[VisionLlmConfigEntry] = []
+    for entry in entries:
+        if entry.provider != "OPENROUTER":
+            continue
+        if entry.is_auto_mode:
+            continue
+        kept.append(entry)
+    return kept
+
+
+def resolve_vision_llm(
+    candidates: list[VisionLlmConfigEntry],
+    *,
+    explicit_slug: str | None,
+) -> ResolvedVisionConfig:
+    """Pick the vision LLM config to attach, by slug or automatically.
+
+    Only non-auto-mode OpenRouter entries from ``candidates`` are considered.
+
+    * ``explicit_slug`` given: exactly one entry's ``model_name`` must
+      equal it. Zero or several matches raise ``VisionConfigError`` with
+      a listing to help the operator.
+    * ``explicit_slug`` is ``None``: the first slug of
+      ``RECOMMENDED_VISION_PRIORITY`` that is registered wins; when none
+      of them is registered, the first OpenRouter entry in listing order
+      is used. ``VisionConfigError`` is raised when there is no
+      OpenRouter vision entry at all.
+    """
+
+    def _resolved(entry: VisionLlmConfigEntry, via: str) -> ResolvedVisionConfig:
+        # Package the chosen entry together with how it was selected.
+        return ResolvedVisionConfig(
+            config_id=entry.id,
+            provider_model=entry.model_name,
+            selected_via=via,
+        )
+
+    pool = _openrouter_only(candidates)
+
+    if explicit_slug is not None:
+        hits = [entry for entry in pool if entry.model_name == explicit_slug]
+        if len(hits) == 1:
+            return _resolved(hits[0], "explicit")
+        if hits:
+            listing = "\n".join(f"  id={e.id}  name={e.name!r}" for e in hits)
+            raise VisionConfigError(
+                f"Multiple OpenRouter vision configs match '{explicit_slug}':\n{listing}"
+            )
+        # No match: show up to eight registered slugs as a hint.
+        sample = ", ".join(entry.model_name for entry in pool[:8]) or "<none>"
+        raise VisionConfigError(
+            f"No OpenRouter vision config found for slug '{explicit_slug}'. "
+            "Make sure `openrouter_integration.vision_enabled: true` in "
+            "global_llm_config.yaml and that the Celery worker has finished "
+            "its first refresh. "
+            f"Available OpenRouter vision slugs (sample): {sample}."
+        )
+
+    if not pool:
+        raise VisionConfigError(
+            "No OpenRouter vision LLM configs are registered with this "
+            "SurfSense backend. Either pass `--no-vision-llm` to the ingest "
+            "step (text-only ingestion), or enable "
+            "`openrouter_integration.vision_enabled: true` in "
+            "global_llm_config.yaml so the Celery worker syncs vision-capable "
+            "OpenRouter models on next refresh."
+        )
+
+    # Index by slug; a later entry with the same slug replaces an earlier one.
+    registered = {}
+    for entry in pool:
+        registered[entry.model_name] = entry
+    for slug in RECOMMENDED_VISION_PRIORITY:
+        if slug in registered:
+            return _resolved(registered[slug], "auto-priority")
+
+    # Nothing recommended is registered: take the first listed entry.
+    # NOTE(review): determinism relies on the backend listing order being
+    # stable — confirm.
+    return _resolved(pool[0], "auto-fallback")
+
+
+__all__ = [
+    "RECOMMENDED_VISION_PRIORITY",
+    "ResolvedVisionConfig",
+    "VisionConfigError",
+    "resolve_vision_llm",
+]
--- a/Show more
+++ b/Show more