Merge remote-tracking branch 'upstream/dev' into feature/multi-agent-with-task-parallelization

This commit is contained in:
CREDO23 2026-05-15 16:44:22 +02:00
commit 4980f9f1ba
193 changed files with 32777 additions and 565 deletions

View file

@ -31,7 +31,7 @@ jobs:
new_tag: ${{ steps.tag_version.outputs.next_version }}
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.branch }}
@ -108,16 +108,18 @@ jobs:
name: surfsense-backend
context: ./surfsense_backend
file: ./surfsense_backend/Dockerfile
target: production
- image: web
name: surfsense-web
context: ./surfsense_web
file: ./surfsense_web/Dockerfile
target: runner
env:
REGISTRY_IMAGE: ghcr.io/${{ github.repository_owner }}/${{ matrix.name }}
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
- name: Set lowercase image name
id: image
@ -125,19 +127,19 @@ jobs:
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
uses: docker/metadata-action@v6
with:
images: ${{ steps.image.outputs.name }}
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
uses: docker/login-action@v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v4
- name: Free up disk space
run: |
@ -149,10 +151,11 @@ jobs:
- name: Build and push by digest ${{ matrix.name }} (${{ matrix.suffix }})
id: build
uses: docker/build-push-action@v6
uses: docker/build-push-action@v7
with:
context: ${{ matrix.context }}
file: ${{ matrix.file }}
target: ${{ matrix.target }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.image.outputs.name }}
outputs: type=image,push-by-digest=true,name-canonical=true,push=true
@ -174,7 +177,7 @@ jobs:
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: digests-${{ matrix.image }}-${{ matrix.suffix }}
path: /tmp/digests/*
@ -205,22 +208,22 @@ jobs:
run: echo "name=${REGISTRY_IMAGE,,}" >> $GITHUB_OUTPUT
- name: Download amd64 digest
uses: actions/download-artifact@v4
uses: actions/download-artifact@v8
with:
name: digests-${{ matrix.image }}-amd64
path: /tmp/digests
- name: Download arm64 digest
uses: actions/download-artifact@v4
uses: actions/download-artifact@v8
with:
name: digests-${{ matrix.image }}-arm64
path: /tmp/digests
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
uses: docker/login-action@v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
@ -239,7 +242,7 @@ jobs:
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
uses: docker/metadata-action@v6
with:
images: ${{ steps.image.outputs.name }}
tags: |

174
.github/workflows/e2e-tests.yml vendored Normal file
View file

@ -0,0 +1,174 @@
name: E2E Tests
on:
pull_request:
branches: [main, dev]
types: [opened, synchronize, reopened, ready_for_review]
paths:
- 'surfsense_web/**'
- 'surfsense_backend/**'
- 'docker/docker-compose.e2e.yml'
- '.github/workflows/e2e-tests.yml'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
name: Journey
runs-on: ubuntu-latest
if: github.event.pull_request.draft == false
timeout-minutes: 30
env:
# Test user that the backend creates via /auth/register before Playwright runs.
PLAYWRIGHT_TEST_EMAIL: e2e-test@surfsense.net
PLAYWRIGHT_TEST_PASSWORD: E2eTestPassword123!
# Frontend env: Playwright's webServer (surfsense_web/playwright.config.ts)
# spawns `pnpm build && pnpm start` in CI; these get baked into the build.
NEXT_PUBLIC_FASTAPI_BACKEND_URL: http://localhost:8000
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: LOCAL
# Shared secret for the test-only POST /__e2e__/auth/token endpoint.
# Must match docker-compose.e2e.yml's backend env (x-backend-env).
E2E_MINT_SECRET: e2e-mint-secret-not-for-production
steps:
- uses: actions/checkout@v6
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v4
# ─── Backend stack ─────────────────────────────────────────────────
# Builds the e2e image (multi-stage, deps cached via GHA), brings up
# db + redis + backend + celery_worker, blocks until every healthcheck
# is green. No `uv` invocation on the runner; no PID files; no curl
# polling loops; readiness is gated by Docker healthchecks.
#
# docker-compose.e2e.yml declares `type=gha` cache_from / cache_to
# entries. BuildKit's gha backend needs the Actions runtime credentials
# (ACTIONS_RUNTIME_TOKEN plus the cache service URL) in its environment.
# docker/build-push-action injects them on its own, but a plain `run:`
# step does not receive them, so export them to the job environment
# before invoking `docker compose ... --build`.
- name: Expose GitHub Actions runtime for BuildKit gha cache
uses: crazy-max/ghaction-github-runtime@v3
- name: Build & start backend stack
run: |
docker compose -f docker/docker-compose.e2e.yml \
up -d --build --wait --wait-timeout 300
- name: Show backend stack status
if: always()
run: docker compose -f docker/docker-compose.e2e.yml ps
- name: Register E2E test user
run: |
# 200/201 = created, 400 = already exists (idempotent across reruns).
STATUS=$(curl -s -o /tmp/register.json -w "%{http_code}" \
-X POST http://localhost:8000/auth/register \
-H "Content-Type: application/json" \
-d "{\"email\":\"${PLAYWRIGHT_TEST_EMAIL}\",\"password\":\"${PLAYWRIGHT_TEST_PASSWORD}\"}")
echo "Register status: ${STATUS}"
cat /tmp/register.json
if [ "${STATUS}" != "200" ] && [ "${STATUS}" != "201" ] && [ "${STATUS}" != "400" ]; then
echo "::error::Failed to register test user (status ${STATUS})"
exit 1
fi
# Flush auth rate-limit counters so Playwright starts clean.
docker compose -f docker/docker-compose.e2e.yml exec -T redis \
sh -c "redis-cli --scan --pattern 'surfsense:auth_rate_limit:*' \
| xargs -r redis-cli DEL" || true
# ─── Frontend (host-side) ──────────────────────────────────────────
# Playwright's webServer block in playwright.config.ts spawns
# `pnpm build && pnpm start` in CI mode and waits for :3000.
- uses: actions/setup-node@v6
with:
node-version: '20'
- uses: pnpm/action-setup@v6
- name: Get pnpm store directory
id: pnpm-cache
shell: bash
run: echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_OUTPUT
- name: Cache pnpm store
uses: actions/cache@v5
with:
path: ${{ steps.pnpm-cache.outputs.STORE_PATH }}
key: pnpm-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
restore-keys: pnpm-${{ runner.os }}-
- name: Install web dependencies
working-directory: surfsense_web
run: pnpm install --frozen-lockfile
- name: Cache Playwright browsers
id: playwright-cache
uses: actions/cache@v5
with:
path: ~/.cache/ms-playwright
key: playwright-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}
- name: Install Playwright browsers
if: steps.playwright-cache.outputs.cache-hit != 'true'
working-directory: surfsense_web
run: pnpm exec playwright install --with-deps chromium
- name: Install Playwright system deps (cache hit)
if: steps.playwright-cache.outputs.cache-hit == 'true'
working-directory: surfsense_web
run: pnpm exec playwright install-deps chromium
- name: Cache Next.js build
uses: actions/cache@v5
with:
path: surfsense_web/.next/cache
key: nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-${{ github.sha }}
restore-keys: |
nextjs-${{ runner.os }}-${{ hashFiles('surfsense_web/pnpm-lock.yaml') }}-
nextjs-${{ runner.os }}-
# ─── Tests ─────────────────────────────────────────────────────────
- name: Run Playwright tests
working-directory: surfsense_web
run: pnpm test:e2e:prod
# ─── Failure diagnostics ───────────────────────────────────────────
- name: Dump backend stack logs on failure
if: ${{ failure() || cancelled() }}
run: |
mkdir -p ./compose-logs
docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps \
> ./compose-logs/all-services.log 2>&1 || true
for svc in db redis backend celery_worker; do
docker compose -f docker/docker-compose.e2e.yml logs --no-color --timestamps "$svc" \
> "./compose-logs/${svc}.log" 2>&1 || true
done
docker compose -f docker/docker-compose.e2e.yml ps \
> ./compose-logs/ps.txt 2>&1 || true
# ─── Artifacts ─────────────────────────────────────────────────────
- name: Upload Playwright HTML report
if: always()
uses: actions/upload-artifact@v7
with:
name: playwright-report
path: surfsense_web/playwright-report/
retention-days: 14
- name: Upload Playwright traces
if: failure()
uses: actions/upload-artifact@v7
with:
name: playwright-traces
path: surfsense_web/test-results/
retention-days: 14
- name: Upload backend stack logs
if: ${{ failure() || cancelled() }}
uses: actions/upload-artifact@v7
with:
name: backend-stack-logs
path: ./compose-logs/
retention-days: 7
# ─── Teardown ──────────────────────────────────────────────────────
- name: Tear down backend stack
if: always()
run: docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans

2
.gitignore vendored
View file

@ -17,3 +17,5 @@ surfsense_web/test-results/
surfsense_web/blob-report/
hermes-agent
hermes-agent/
content_research/

View file

@ -4,7 +4,7 @@
# Database, Redis, and internal service wiring are handled automatically.
# ==============================================================================
# SurfSense version (use "latest", a clean version like "0.0.14", or a specific build like "0.0.14.1")
# SurfSense version (use "latest" or a specific version like "0.0.14")
SURFSENSE_VERSION=latest
# ------------------------------------------------------------------------------

View file

@ -10,6 +10,11 @@
name: surfsense-dev
x-backend-build: &backend-build
context: ../surfsense_backend
args:
EMBEDDING_MODEL: ${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}
services:
db:
image: pgvector/pgvector:pg17
@ -69,7 +74,7 @@ services:
retries: 5
backend:
build: ../surfsense_backend
build: *backend-build
ports:
- "${BACKEND_PORT:-8000}:8000"
volumes:
@ -114,7 +119,7 @@ services:
start_period: 200s
celery_worker:
build: ../surfsense_backend
build: *backend-build
volumes:
- ../surfsense_backend/app:/app/app
- shared_temp:/shared_tmp
@ -140,7 +145,7 @@ services:
condition: service_healthy
celery_beat:
build: ../surfsense_backend
build: *backend-build
env_file:
- ../surfsense_backend/.env
environment:
@ -159,7 +164,7 @@ services:
condition: service_started
# flower:
# build: ../surfsense_backend
# build: *backend-build
# ports:
# - "${FLOWER_PORT:-5555}:5555"
# env_file:

View file

@ -0,0 +1,181 @@
# =============================================================================
# SurfSense — E2E Docker Compose stack
# =============================================================================
# Hermetic backend stack for Playwright E2E tests:
# - db / redis on an internal-only network (no internet egress)
# - backend (FastAPI) joins the internal network AND a separate ingress
# bridge so the host runner can reach :8000
# - celery_worker on the internal network only — zero egress surface
#
# The backend image is built from surfsense_backend/Dockerfile target=e2e,
# which adds tests/ via the `tests-source` additional context (tests/ is
# excluded from the main context by .dockerignore so production never ships
# test fakes). See surfsense_backend/Dockerfile for stage layout.
#
# Usage from repo root:
# docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
# curl -X POST http://localhost:8000/auth/register ...
# ( run Playwright on host, pointing at localhost:8000 + localhost:3000 )
# docker compose -f docker/docker-compose.e2e.yml down -v
# =============================================================================
name: surfsense-e2e
x-backend-env: &backend-env
DATABASE_URL: postgresql+asyncpg://postgres:postgres@db:5432/surfsense_e2e
CELERY_BROKER_URL: redis://redis:6379/0
CELERY_RESULT_BACKEND: redis://redis:6379/0
REDIS_APP_URL: redis://redis:6379/0
CELERY_TASK_DEFAULT_QUEUE: surfsense
SECRET_KEY: ci-test-secret-key-not-for-production
AUTH_TYPE: LOCAL
REGISTRATION_ENABLED: "TRUE"
ETL_SERVICE: DOCLING
EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
NEXT_FRONTEND_URL: http://host.docker.internal:3000
# Sentinel keys — fakes never read them; turns leaked real calls into 401s.
COMPOSIO_API_KEY: e2e-deny-real-call-sentinel
COMPOSIO_ENABLED: "TRUE"
OPENAI_API_KEY: e2e-deny-real-call-sentinel
ANTHROPIC_API_KEY: e2e-deny-real-call-sentinel
LITELLM_API_KEY: e2e-deny-real-call-sentinel
MICROSOFT_CLIENT_ID: fake-microsoft-client-id
MICROSOFT_CLIENT_SECRET: fake-microsoft-client-secret
ONEDRIVE_REDIRECT_URI: http://localhost:8000/api/v1/auth/onedrive/connector/callback
DROPBOX_APP_KEY: fake-dropbox-app-key
DROPBOX_APP_SECRET: fake-dropbox-app-secret
DROPBOX_REDIRECT_URI: http://localhost:8000/api/v1/auth/dropbox/connector/callback
# Defense-in-depth: even though L3 egress is denied for the worker via
# `internal: true`, the backend still has a route via `ingress`. Setting
# HTTPS_PROXY to an unreachable port turns any leaked Python outbound HTTP
# call into a fast Connection refused. UNLIKE the old runner-shell setup,
# this proxy is set on the container env and `uv` is never invoked here,
# so there is no interaction with uv's implicit-sync behaviour.
HTTPS_PROXY: http://127.0.0.1:1
HTTP_PROXY: http://127.0.0.1:1
NO_PROXY: localhost,127.0.0.1,0.0.0.0,db,redis,host.docker.internal
HF_HUB_OFFLINE: "1"
TRANSFORMERS_OFFLINE: "1"
# Test-only token-mint endpoint secret (see tests/e2e/run_backend.py).
E2E_MINT_SECRET: e2e-mint-secret-not-for-production
services:
db:
image: pgvector/pgvector:pg17
command: >
postgres
-c wal_level=logical
-c max_wal_senders=10
-c max_replication_slots=10
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: surfsense_e2e
# Ephemeral storage — every CI run gets a clean DB, no volume cleanup needed.
tmpfs:
- /var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres -d surfsense_e2e"]
interval: 2s
timeout: 3s
retries: 30
networks: [internal]
redis:
image: redis:8-alpine
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 2s
timeout: 3s
retries: 30
networks: [internal]
backend:
build:
context: ../surfsense_backend
dockerfile: Dockerfile
target: e2e
additional_contexts:
# tests/ is excluded from the main context by .dockerignore;
# the e2e stage's `COPY --from=tests-source` pulls it in here.
tests-source: ../surfsense_backend/tests
args:
EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2
cache_from:
- type=gha,scope=surfsense-e2e-backend
cache_to:
- type=gha,mode=max,scope=surfsense-e2e-backend
image: surfsense-e2e-backend:local
environment:
<<: *backend-env
SERVICE_ROLE: api
volumes:
- shared_temp:/shared_tmp
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- "8000:8000"
depends_on:
db: { condition: service_healthy }
redis: { condition: service_healthy }
healthcheck:
# Use Python (already in the image) instead of curl/wget to avoid
# depending on either tool being installed in the runtime layers.
test:
- CMD
- python
- -c
- |
import sys, urllib.request
try:
r = urllib.request.urlopen("http://localhost:8000/openapi.json", timeout=2)
sys.exit(0 if r.status == 200 else 1)
except Exception:
sys.exit(1)
interval: 3s
timeout: 5s
retries: 60
start_period: 30s
networks:
- internal # to reach db/redis
- ingress # so host can reach :8000
celery_worker:
image: surfsense-e2e-backend:local
pull_policy: never
# No build: section — reuses the image built by the `backend` service.
# Compose v2 builds shared images exactly once across services that
# reference the same `image:` tag.
environment:
<<: *backend-env
SERVICE_ROLE: worker
volumes:
- shared_temp:/shared_tmp
depends_on:
backend: { condition: service_healthy }
healthcheck:
test:
- CMD-SHELL
- "celery -A app.celery_app inspect ping --timeout 2 | grep -q pong"
interval: 5s
timeout: 5s
retries: 12
start_period: 20s
networks: [internal]
networks:
# Internal network: containers attached only to this network have NO route
# to the host or the internet. This is the L3 deny-egress mechanism that
# replaces the fragile HTTPS_PROXY-on-the-runner approach.
internal:
driver: bridge
internal: true
# Regular bridge network. Only the `backend` service joins it, solely so
# the host can reach :8000 via the published port. celery_worker / db /
# redis stay off this network entirely.
ingress:
driver: bridge
volumes:
shared_temp:

View file

@ -1,5 +1,5 @@
{
"name": "surfsense",
"private": true,
"packageManager": "pnpm@10.24.0"
"packageManager": "pnpm@10.26.0"
}

View file

@ -13,5 +13,5 @@ celerybeat-schedule*
celerybeat-schedule.*
celerybeat-schedule.dir
celerybeat-schedule.bak
global_llm_config.yaml
/app/config/global_llm_config.yaml
app/templates/_generated/

View file

@ -1,8 +1,16 @@
FROM python:3.12-slim
# =============================================================================
# SurfSense Backend — Multi-stage Dockerfile
# =============================================================================
# Graph: base → deps → models → {e2e, production}
# e2e — tests/ via additional_contexts (docker-compose.e2e.yml)
# production — published ghcr.io image (docker-build.yml pins target)
# =============================================================================
# ─── Stage 1: base (system deps, Pandoc, certificates) ──────────────────────
FROM python:3.12-slim AS base
WORKDIR /app
# Install system dependencies including SSL tools, CUDA dependencies, and Tesseract OCR
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
python3-dev \
@ -11,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
unzip \
gnupg2 \
ffmpeg \
espeak-ng \
libsndfile1 \
libgl1 \
@ -22,21 +31,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
git \
&& rm -rf /var/lib/apt/lists/*
# Install Pandoc 3.x from GitHub as a fallback for Linux where pypandoc_binary
# may not bundle pandoc (apt ships 2.17 which has broken table rendering).
# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks this up.
RUN which ffmpeg && ffmpeg -version
# Pandoc 3.x from GitHub Releases — apt ships 2.17 which has broken table rendering.
# pypandoc_binary bundles pandoc on Windows/macOS; on Linux it picks up this binary.
RUN ARCH=$(dpkg --print-architecture) && \
wget -qO /tmp/pandoc.deb "https://github.com/jgm/pandoc/releases/download/3.9/pandoc-3.9-1-${ARCH}.deb" && \
dpkg -i /tmp/pandoc.deb && \
rm /tmp/pandoc.deb
# Update certificates and install SSL tools
RUN update-ca-certificates
RUN pip install --upgrade certifi pip-system-certs
# Copy requirements
COPY pyproject.toml .
COPY uv.lock .
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
ENV SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD=FALSE
# ─── Stage 2: deps (Python deps frozen from uv.lock) ────────────────────────
FROM base AS deps
COPY pyproject.toml uv.lock ./
# Install all Python dependencies from uv.lock for deterministic builds.
#
@ -49,9 +64,7 @@ COPY uv.lock .
# Note on torch/CUDA: we do NOT install torch from a separate cu* index here.
# PyPI's torch wheels for Linux x86_64 already ship CUDA-enabled and pull
# nvidia-cudnn-cu13, nvidia-nccl-cu13, triton, etc. as install deps (all
# captured in uv.lock). Installing from cu121 first only wasted ~2GB of
# downloads that the lock-based install immediately replaced. If a specific
# CUDA version is needed (driver compatibility, etc.), wire it through
# captured in uv.lock). If a specific CUDA version is needed, wire it through
# [tool.uv.sources] in pyproject.toml so the lock stays the source of truth.
RUN pip install --no-cache-dir uv && \
uv export --frozen --no-dev --no-hashes --no-emit-project \
@ -59,49 +72,42 @@ RUN pip install --no-cache-dir uv && \
uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \
rm /tmp/requirements.txt
# Set SSL environment variables dynamically
RUN CERTIFI_PATH=$(python -c "import certifi; print(certifi.where())") && \
echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" && \
echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc && \
echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
ENV REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
# ─── Stage 3: models (pre-baked offline assets) ─────────────────────────────
FROM deps AS models
# Pre-download EasyOCR models to avoid runtime SSL issues
RUN mkdir -p /root/.EasyOCR/model
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true
RUN wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true
RUN cd /root/.EasyOCR/model && (unzip -o english_g2.zip || true) && (unzip -o craft_mlt_25k.zip || true)
RUN mkdir -p /root/.EasyOCR/model && \
wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip -O /root/.EasyOCR/model/english_g2.zip || true && \
wget --no-check-certificate https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip -O /root/.EasyOCR/model/craft_mlt_25k.zip || true && \
cd /root/.EasyOCR/model && \
(unzip -o english_g2.zip || true) && \
(unzip -o craft_mlt_25k.zip || true)
# Pre-download Docling models
RUN python -c "try:\n from docling.document_converter import DocumentConverter\n conv = DocumentConverter()\nexcept:\n pass" || true
RUN printf '%s\n' \
'try:' \
' from docling.document_converter import DocumentConverter' \
' DocumentConverter()' \
'except Exception:' \
' pass' \
| python || true
# Install Playwright browsers for web scraping (the playwright package itself
# is already installed via uv.lock above)
ARG EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
RUN python -c "from chonkie import AutoEmbeddings; AutoEmbeddings.get_embeddings('${EMBEDDING_MODEL}')"
# Install Playwright browsers (the playwright python package itself is in deps)
RUN playwright install chromium --with-deps
# Copy source code
COPY . .
# Install the project itself in editable mode. Dependencies were already
# installed deterministically from uv.lock above, so --no-deps prevents any
# re-resolution that could pull newer versions.
RUN uv pip install --system --no-cache-dir --no-deps -e .
# Copy and set permissions for entrypoint script
# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
# Shared temp directory for file uploads between API and Worker containers.
# Python's tempfile module uses TMPDIR, so uploaded files land here.
# Mount the SAME volume at /shared_tmp on both API and Worker in Coolify.
RUN mkdir -p /shared_tmp
ENV TMPDIR=/shared_tmp
# Prevent uvloop compatibility issues
ENV PYTHONPATH=/app
ENV UVICORN_LOOP=asyncio
ENV TMPDIR=/shared_tmp
ENV PYTHONUNBUFFERED=1
# Tune glibc malloc to return freed memory to the OS more aggressively.
# Without these, Python's gc.collect() frees objects but the underlying
@ -110,6 +116,56 @@ ENV MALLOC_MMAP_THRESHOLD_=65536
ENV MALLOC_TRIM_THRESHOLD_=131072
ENV MALLOC_MMAP_MAX_=65536
# ─── Stage 4: e2e (production source + tests/ + e2e entrypoint) ─────────────
# Built via `docker buildx build --target e2e`. The default build target is
# `production` (the last stage), so this stage is opt-in for CI only.
#
# `tests/` is excluded from the main build context by .dockerignore (so prod
# can never accidentally ship test fakes). The e2e stage receives tests/
# through an "additional context" passed by docker-compose.e2e.yml — see
# https://docs.docker.com/reference/compose-file/build/#additional_contexts
FROM models AS e2e
# Same source copy as production. .dockerignore filters out tests/.
COPY . .
# Bring tests/ in via the named additional build context. CI passes
# --build-context tests-source=./tests
# (or the equivalent additional_contexts entry in docker-compose.e2e.yml).
COPY --from=tests-source . ./tests/
# Install the project itself in editable mode. Dependencies were already
# installed deterministically from uv.lock above, so --no-deps prevents any
# re-resolution that could pull newer versions.
RUN uv pip install --system --no-cache-dir --no-deps -e .
COPY scripts/docker/entrypoint.e2e.sh /app/scripts/docker/entrypoint.e2e.sh
RUN dos2unix /app/scripts/docker/entrypoint.e2e.sh && chmod +x /app/scripts/docker/entrypoint.e2e.sh
# SERVICE_ROLE is overridden per service in docker-compose.e2e.yml (api / worker).
ENV SERVICE_ROLE=api
EXPOSE 8000-8001
CMD ["/app/scripts/docker/entrypoint.e2e.sh"]
# ─── Stage 5: production (published ghcr.io image) ──────────────────────────
# CI pins `target: production`; also the default for `docker build` / dev compose.
FROM models AS production
# Copy source code (tests/ excluded by .dockerignore — production never ships tests).
COPY . .
# Install the project itself in editable mode. Dependencies were already
# installed deterministically from uv.lock above, so --no-deps prevents any
# re-resolution that could pull newer versions.
RUN uv pip install --system --no-cache-dir --no-deps -e .
# Use dos2unix to ensure LF line endings (fixes CRLF issues from Windows checkouts)
COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
# SERVICE_ROLE controls which process this container runs:
# api FastAPI backend only (runs migrations on startup)
# worker Celery worker only
@ -127,6 +183,5 @@ ENV CELERY_MAX_TASKS_PER_CHILD=50
# "" both queues (default, for single-worker setups)
ENV CELERY_QUEUES=""
# Run
EXPOSE 8000-8001
CMD ["/app/scripts/docker/entrypoint.sh"]
CMD ["/app/scripts/docker/entrypoint.sh"]

View file

@ -67,7 +67,11 @@ def run_migrations_offline() -> None:
def do_run_migrations(connection: Connection) -> None:
context.configure(connection=connection, target_metadata=target_metadata)
context.configure(
connection=connection,
target_metadata=target_metadata,
transaction_per_migration=True,
)
with context.begin_transaction():
context.run_migrations()

View file

@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
bind = op.get_bind()
if sa.inspect(bind).has_table("agent_action_log"):
return
op.create_table(
"agent_action_log",
sa.Column("id", sa.Integer(), primary_key=True, index=True),

View file

@ -29,6 +29,21 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Create the two revision tables, skipping any that already exist.

    Each table is checked on its own, so the migration is a no-op when
    both are present and only fills the gap when exactly one is missing.
    """
    inspector = sa.inspect(op.get_bind())
    # (table name, function that creates it), in creation order.
    table_creators = (
        ("document_revisions", _create_document_revisions),
        ("folder_revisions", _create_folder_revisions),
    )
    for table_name, create_table in table_creators:
        if not inspector.has_table(table_name):
            create_table()
def _create_document_revisions() -> None:
op.create_table(
"document_revisions",
sa.Column("id", sa.Integer(), primary_key=True, index=True),
@ -74,6 +89,8 @@ def upgrade() -> None:
),
)
def _create_folder_revisions() -> None:
op.create_table(
"folder_revisions",
sa.Column("id", sa.Integer(), primary_key=True, index=True),

View file

@ -26,6 +26,10 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
bind = op.get_bind()
if sa.inspect(bind).has_table("agent_permission_rules"):
return
op.create_table(
"agent_permission_rules",
sa.Column("id", sa.Integer(), primary_key=True, index=True),

View file

@ -50,29 +50,39 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.add_column(
"agent_action_log",
sa.Column("tool_call_id", sa.String(length=64), nullable=True),
)
op.add_column(
"agent_action_log",
sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
)
bind = op.get_bind()
inspector = sa.inspect(bind)
columns = {c["name"] for c in inspector.get_columns("agent_action_log")}
indexes = {i["name"] for i in inspector.get_indexes("agent_action_log")}
op.create_index(
"ix_agent_action_log_tool_call_id",
"agent_action_log",
["tool_call_id"],
)
op.create_index(
"ix_agent_action_log_chat_turn_id",
"agent_action_log",
["chat_turn_id"],
)
if "tool_call_id" not in columns:
op.add_column(
"agent_action_log",
sa.Column("tool_call_id", sa.String(length=64), nullable=True),
)
if "chat_turn_id" not in columns:
op.add_column(
"agent_action_log",
sa.Column("chat_turn_id", sa.String(length=64), nullable=True),
)
op.execute(
"UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
)
if "ix_agent_action_log_tool_call_id" not in indexes:
op.create_index(
"ix_agent_action_log_tool_call_id",
"agent_action_log",
["tool_call_id"],
)
if "ix_agent_action_log_chat_turn_id" not in indexes:
op.create_index(
"ix_agent_action_log_chat_turn_id",
"agent_action_log",
["chat_turn_id"],
)
if "turn_id" in columns:
op.execute(
"UPDATE agent_action_log SET tool_call_id = turn_id WHERE tool_call_id IS NULL"
)
def downgrade() -> None:

View file

@ -36,15 +36,22 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
op.add_column(
"new_chat_messages",
sa.Column("turn_id", sa.String(length=64), nullable=True),
)
op.create_index(
"ix_new_chat_messages_turn_id",
"new_chat_messages",
["turn_id"],
)
bind = op.get_bind()
inspector = sa.inspect(bind)
columns = {c["name"] for c in inspector.get_columns("new_chat_messages")}
indexes = {i["name"] for i in inspector.get_indexes("new_chat_messages")}
if "turn_id" not in columns:
op.add_column(
"new_chat_messages",
sa.Column("turn_id", sa.String(length=64), nullable=True),
)
if "ix_new_chat_messages_turn_id" not in indexes:
op.create_index(
"ix_new_chat_messages_turn_id",
"new_chat_messages",
["turn_id"],
)
def downgrade() -> None:

View file

@ -27,6 +27,8 @@ from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
revision: str = "137"
@ -39,6 +41,11 @@ _INDEX_NAME = "ux_agent_action_log_reverse_of"
def upgrade() -> None:
bind = op.get_bind()
indexes = {i["name"] for i in sa.inspect(bind).get_indexes("agent_action_log")}
if _INDEX_NAME in indexes:
return
# Defensively de-dup any pre-existing double-revert rows before
# adding the unique index. Keeps the OLDEST row (smallest id) and
# NULLs out the duplicates' ``reverse_of`` so they survive as audit

View file

@ -53,6 +53,11 @@ TABLE_NAME = "new_chat_messages"
def upgrade() -> None:
bind = op.get_bind()
indexes = {i["name"] for i in sa.inspect(bind).get_indexes(TABLE_NAME)}
if INDEX_NAME in indexes:
return
op.create_index(
INDEX_NAME,
TABLE_NAME,

View file

@ -473,10 +473,16 @@ def initialize_vision_llm_router():
class Config:
# Check if ffmpeg is installed
if not is_ffmpeg_installed():
import static_ffmpeg
allow_static_ffmpeg = (
os.getenv("SURFSENSE_ALLOW_STATIC_FFMPEG_DOWNLOAD", "TRUE").upper()
== "TRUE"
)
if allow_static_ffmpeg:
import static_ffmpeg
# ffmpeg installed on first call to add_paths(), threadsafe.
static_ffmpeg.add_paths()
# ffmpeg installed on first call to add_paths(), threadsafe.
static_ffmpeg.add_paths()
# check if ffmpeg is installed again
if not is_ffmpeg_installed():
raise ValueError(

View file

@ -134,12 +134,92 @@ class EtlPipelineService:
else:
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
# When the operator opts into vision-LLM at ingest, walk the
# original file's embedded images and append a structured
# "Image Content" section. The parser's own OCR (Docling
# do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
# image; this side handles the *visual* description which the
# parsers all drop today.
content = await self._maybe_append_picture_descriptions(request, content)
return EtlResult(
markdown_content=content,
etl_service=etl_service,
content_type="document",
)
async def _maybe_append_picture_descriptions(
    self, request: EtlRequest, markdown: str
) -> str:
    """Append vision-LLM image descriptions to the parser's markdown.

    Returns ``markdown`` untouched when no vision LLM is configured, when
    ``describe_pictures`` raises, or when it produces no descriptions.
    Otherwise returns the result of ``merge_descriptions_into_markdown``.
    """
    vision_llm = self._vision_llm
    if vision_llm is None:
        return markdown

    # Imported lazily, only once a vision LLM is known to be configured.
    from app.etl_pipeline.picture_describer import (
        describe_pictures,
        merge_descriptions_into_markdown,
    )

    async def _ocr_image(image_path: str, image_name: str) -> str:
        # OCR callback handed to describe_pictures: each extracted image
        # is pushed back through a fresh EtlPipelineService as a
        # standalone file. The sub-service is built with vision_llm=None,
        # so its own call to this method returns immediately and cannot
        # recurse into picture description again.
        try:
            ocr_only_service = EtlPipelineService(vision_llm=None)
            extraction = await ocr_only_service.extract(
                EtlRequest(file_path=image_path, filename=image_name)
            )
        except (
            EtlUnsupportedFileError,
            EtlServiceUnavailableError,
        ) as exc:
            # Expected when the configured ETL service cannot handle this
            # image (or none is configured): stay quiet and report no
            # OCR text for it.
            logging.debug(
                "Skipping per-image OCR for %s: %s", image_name, exc
            )
            return ""
        else:
            return extraction.markdown_content

    try:
        outcome = await describe_pictures(
            request.file_path,
            request.filename,
            vision_llm,
            ocr_runner=_ocr_image,
        )
    except Exception:
        # Descriptions are purely additive, so a failure here must not
        # turn a successful document extraction into an error.
        logging.warning(
            "Picture description failed for %s, returning parser output unchanged",
            request.filename,
            exc_info=True,
        )
        return markdown

    if outcome.descriptions:
        merged = merge_descriptions_into_markdown(markdown, outcome)
        logging.info(
            "Vision LLM described %d image(s) in %s "
            "(skipped: %d small / %d large / %d duplicate, %d failed)",
            len(outcome.descriptions),
            request.filename,
            outcome.skipped_too_small,
            outcome.skipped_too_large,
            outcome.skipped_duplicate,
            outcome.failed,
        )
        return merged

    return markdown
async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
"""Try Azure Document Intelligence first (when configured) then LlamaCloud.

View file

@ -4,12 +4,34 @@ import os
from langchain_core.messages import HumanMessage
# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
# A standalone image IS the document, so we want everything: visual
# content plus any text the model can read off it. The output is
# combined markdown that the chunker treats as the full document body.
_PROMPT = (
"Describe this image in markdown. "
"Transcribe any visible text verbatim. "
"Be concise but complete — let the image content guide the level of detail."
)
# Per-image-in-PDF prompt. Here the image is *inside* a larger
# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
# already running OCR over the whole page — including text rendered
# into images. So we explicitly tell the model NOT to transcribe text
# and to focus only on visual interpretation. This avoids paying
# output tokens for OCR content the ETL pipeline already captured.
_DESCRIPTION_PROMPT = (
"Describe what this image visually depicts in concise markdown. "
"Focus on visual content — anatomy, structures, charts, diagrams, "
"spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
"histology slide), and any clinically or structurally relevant "
"findings.\n\n"
"Do NOT transcribe text from the image. Any text in the image "
"(axis labels, annotations, scale bars, lab values, etc.) is "
"already extracted by a separate OCR pipeline; duplicating it "
"here would be redundant. Stick to the visual interpretation."
)
_MAX_IMAGE_BYTES = (
5 * 1024 * 1024
) # 5 MB (Anthropic Claude's limit, the most restrictive)
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
return f"data:{mime_type};base64,{encoded}"
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
data_url = _image_to_data_url(file_path)
async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
message = HumanMessage(
content=[
{"type": "text", "text": _PROMPT},
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": data_url}},
]
)
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
if not text or not text.strip():
raise ValueError(f"Vision LLM returned empty content for {filename}")
return text.strip()
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Turn a standalone image upload into its full markdown body.

    The uploaded image is the whole document in this path, so the
    single-shot ``_PROMPT`` asks the model for a visual description and a
    verbatim transcription of visible text in the same response.
    """
    return await _invoke_vision(
        llm, _PROMPT, _image_to_data_url(file_path), filename
    )
async def parse_image_for_description(
    file_path: str, filename: str, llm
) -> str:
    """Describe what an embedded image shows, without transcribing its text.

    Intended for images that live inside a larger document (the
    ``picture_describer`` path). The prompt sent is
    ``_DESCRIPTION_PROMPT``, which tells the model to skip text-in-image
    because the document's OCR pass is expected to capture it already.
    """
    return await _invoke_vision(
        llm, _DESCRIPTION_PROMPT, _image_to_data_url(file_path), filename
    )
__all__ = [
"parse_image_for_description",
"parse_with_vision_llm",
]

View file

@ -0,0 +1,678 @@
"""Extract embedded images from PDFs, describe them, and inject the
descriptions inline into the parser's markdown.
When the operator passes ``use_vision_llm=True`` for a PDF, the document
parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
but mostly drop the actual image content -- a CT scan inside a clinical
PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
and the caption text below it.
This module fills that gap. After the document parser produces markdown
text, we:
1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
image (deduped by sha256, size-capped to match the vision LLM's own
limits).
2. Run the vision LLM on each unique image (visual description) and,
in parallel when an OCR runner is provided, re-feed the same image
through the ETL service for per-image OCR.
3. **Inject** a horizontal-rule-delimited markdown section -- with
named "OCR text" and "Visual description" sub-sections -- where the
image actually appears in the parser's markdown. Two splice modes,
chosen by which marker the parser emitted:
- **Replace** Docling-style ``<!-- image -->`` placeholders (and an
optional ``Image: <filename>`` caption line). The placeholder
carries no useful content of its own, so we substitute our block
for it.
- **Append after** layout-aware ``<figure>...</figure>`` blocks
(Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
already contain parser-extracted chart values / OCR'd labels /
captions, which are themselves useful for retrieval -- so we
PRESERVE the figure verbatim and add our vision-LLM block
immediately after it. The chunk then contains both the parser's
structured numbers AND the VLM's semantic interpretation.
Either way, the image content stays in context with the surrounding
document body rather than getting orphaned at the end -- crucial for
retrieval, where a single chunk should contain the question, the
image content, and the answer options together.
If no placeholders, figures, or captions can be matched (e.g. an
unusual parser output), we fall back to appending an
``## Image Content`` section so no image content is silently lost.
"""
from __future__ import annotations
import asyncio
import contextlib
import hashlib
import logging
import re
import tempfile
from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
# Type alias for the OCR callback. Takes (file_path, filename), returns
# the OCR'd markdown text -- or empty string if no text was found, or
# raises if OCR failed unrecoverably (which the describer catches and
# treats as "no OCR for this image" rather than failing the whole doc).
OcrRunner = Callable[[str, str], Awaitable[str]]

logger = logging.getLogger(__name__)

# Bound how many images are processed in parallel for a single
# document. Vision models are typically rate-limited; 4 concurrent
# calls is a safe default that respects most provider limits while
# keeping wall-clock manageable for image-heavy PDFs. The semaphore
# built from this value is held around BOTH the vision-LLM call and the
# optional per-image OCR call, so it also caps concurrent OCR runs.
_VISION_CONCURRENCY = 4

# Match parse_with_vision_llm's per-image cap so we don't even attempt
# images that the vision LLM would reject anyway (Anthropic's 5 MB
# limit is the most restrictive among the major providers). Images
# above this size are counted in ``skipped_too_large``.
_MAX_IMAGE_BYTES = 5 * 1024 * 1024

# Skip degenerate images: tracking pixels, very small decorative dots,
# scanner-introduced artefacts. We can't cheaply check pixel dimensions
# without decoding the image, so we approximate: anything under 1 KB is
# almost certainly not informative content. Images below this size are
# counted in ``skipped_too_small``.
_MIN_IMAGE_BYTES = 1024
@dataclass
class PictureDescription:
    """One image pulled out of a document, plus what was learned about it.

    Holds two independent pieces of content:

    - ``description`` comes from the vision LLM and says what the image
      shows (anatomy, charts, layout, ...).
    - ``ocr_text`` comes from running the image through the configured
      ETL service as if it were uploaded on its own. It stays ``None``
      when no OCR runner was supplied, OCR failed, or no text was found.
    """

    # Page the image was found on (1-indexed).
    page_number: int
    # Position of the image within its page (0-indexed).
    ordinal_in_page: int
    # Name reported by pypdf (e.g. "Im0"), or a synthesized fallback.
    name: str
    # SHA-256 hex digest of the raw image bytes.
    sha256: str
    # Markdown visual description produced by the vision LLM.
    description: str
    # Text found by the ETL service's OCR, if any.
    ocr_text: str | None = None
@dataclass
class PictureExtractionResult:
    """Everything produced by one picture-extraction pass over a document.

    ``descriptions`` holds the successfully described images; the integer
    counters record how many images were left out and for which reason.
    """

    descriptions: list[PictureDescription] = field(default_factory=list)
    # Images above the size cap.
    skipped_too_large: int = 0
    # Images below the minimum size.
    skipped_too_small: int = 0
    # Images whose bytes matched an earlier image in the same document.
    skipped_duplicate: int = 0
    # Images for which the vision LLM call did not yield a description.
    failed: int = 0

    @property
    def has_content(self) -> bool:
        """True when at least one image was described."""
        return len(self.descriptions) > 0
def _is_pdf(filename: str) -> bool:
return filename.lower().endswith(".pdf")
def _pick_suffix(name: str) -> str:
lower = name.lower()
for ext in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp"):
if lower.endswith(ext):
return ".jpeg" if ext == ".jpg" else ext
return ".png"
def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
    """Collect the embedded images of a PDF, page by page.

    Each entry is ``(page_number, ordinal_in_page, name, data)`` with a
    1-indexed page number and a 0-indexed ordinal. A PDF that cannot be
    opened yields an empty list; a page or image that cannot be read is
    logged and left out so the rest of the document is still processed.
    """
    from pypdf import PdfReader

    extracted: list[tuple[int, int, str, bytes]] = []

    try:
        pdf = PdfReader(file_path)
    except Exception:
        logger.warning(
            "pypdf failed to open %s for image extraction",
            file_path,
            exc_info=True,
        )
        return extracted

    for page_number, page in enumerate(pdf.pages, start=1):
        try:
            page_images = list(page.images)
        except Exception:
            logger.warning(
                "pypdf failed to enumerate images on page %d of %s",
                page_number,
                file_path,
                exc_info=True,
            )
            continue

        for ordinal, image in enumerate(page_images):
            try:
                # Fall back to a synthesized name when pypdf gives none.
                image_name = (
                    getattr(image, "name", None)
                    or f"page{page_number}_img{ordinal}"
                )
                payload = image.data
            except Exception:
                logger.warning(
                    "pypdf failed to read image %d on page %d of %s",
                    ordinal,
                    page_number,
                    file_path,
                    exc_info=True,
                )
                continue
            extracted.append((page_number, ordinal, image_name, payload))

    return extracted
async def _describe_one(
    page_number: int,
    ordinal: int,
    name: str,
    sha256: str,
    data: bytes,
    vision_llm: Any,
    semaphore: asyncio.Semaphore,
    ocr_runner: OcrRunner | None,
) -> PictureDescription | None:
    """Describe a single image and, when a runner is given, OCR it too.

    The bytes are written to a temp file that both calls read from; the
    file is removed afterwards regardless of outcome. Returns ``None``
    when the vision LLM call fails. An OCR failure only leaves
    ``ocr_text`` as ``None``.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    # delete=False: the helpers below reopen the file by path, so it has
    # to outlive this ``with``. Removal happens in the ``finally``.
    with tempfile.NamedTemporaryFile(
        suffix=_pick_suffix(name), delete=False
    ) as handle:
        handle.write(data)
        tmp_path = handle.name

    try:
        async with semaphore:
            pending: list[Awaitable[Any]] = [
                parse_image_for_description(tmp_path, name, vision_llm)
            ]
            if ocr_runner is not None:
                pending.append(ocr_runner(tmp_path, name))
            # Exceptions come back as values so one branch failing does
            # not discard the other branch's result.
            outcomes = await asyncio.gather(*pending, return_exceptions=True)

        vision_outcome = outcomes[0]
        if isinstance(vision_outcome, BaseException):
            logger.warning(
                "Vision LLM failed for image %s on page %d, skipping",
                name,
                page_number,
                exc_info=vision_outcome,
            )
            return None
        description = str(vision_outcome)

        ocr_text: str | None = None
        if ocr_runner is not None and len(outcomes) > 1:
            ocr_outcome = outcomes[1]
            if isinstance(ocr_outcome, BaseException):
                logger.warning(
                    "Per-image OCR failed for image %s on page %d, "
                    "omitting OCR field for this image",
                    name,
                    page_number,
                    exc_info=ocr_outcome,
                )
            else:
                # Blank OCR output is stored as None so the rendered
                # block carries no empty OCR section.
                ocr_text = str(ocr_outcome).strip() or None
    finally:
        with contextlib.suppress(OSError):
            Path(tmp_path).unlink()

    return PictureDescription(
        page_number=page_number,
        ordinal_in_page=ordinal,
        name=name,
        sha256=sha256,
        description=description,
        ocr_text=ocr_text,
    )
async def describe_pictures(
    file_path: str,
    filename: str,
    vision_llm: Any,
    *,
    ocr_runner: OcrRunner | None = None,
) -> PictureExtractionResult:
    """Describe every eligible embedded image of a PDF with the vision LLM.

    Only files whose name ends in ``.pdf`` are processed; anything else,
    or a missing ``vision_llm``, yields an empty result. Images are
    filtered by size first and then de-duplicated by SHA-256 of their
    bytes; each filter increments its own ``skipped_*`` counter. The
    remaining images are described concurrently, and ``ocr_runner`` (when
    given) is invoked per image to fill ``PictureDescription.ocr_text``.
    Images whose description could not be produced are counted in
    ``failed``.
    """
    outcome = PictureExtractionResult()
    if not (_is_pdf(filename) and vision_llm is not None):
        return outcome

    raw_images = _extract_pdf_images(file_path)
    if not raw_images:
        return outcome

    known_digests: set[str] = set()
    candidates: list[tuple[int, int, str, str, bytes]] = []
    for page_number, ordinal, name, payload in raw_images:
        size = len(payload)
        if size > _MAX_IMAGE_BYTES:
            outcome.skipped_too_large += 1
        elif size < _MIN_IMAGE_BYTES:
            outcome.skipped_too_small += 1
        else:
            digest = hashlib.sha256(payload).hexdigest()
            if digest in known_digests:
                outcome.skipped_duplicate += 1
            else:
                known_digests.add(digest)
                candidates.append((page_number, ordinal, name, digest, payload))

    if not candidates:
        return outcome

    # One shared semaphore caps how many images are in flight at once.
    limiter = asyncio.Semaphore(_VISION_CONCURRENCY)
    described = await asyncio.gather(
        *(
            _describe_one(
                page_number, ordinal, name, digest, payload,
                vision_llm, limiter, ocr_runner,
            )
            for page_number, ordinal, name, digest, payload in candidates
        )
    )

    for item in described:
        if item is None:
            outcome.failed += 1
            continue
        outcome.descriptions.append(item)
    return outcome
# ---------------------------------------------------------------------------
# Rendering: build the per-image markdown block + inject inline.
# ---------------------------------------------------------------------------
def _format_image_block(
name: str,
description: str,
ocr_text: str | None = None,
) -> str:
"""Render the per-image block as a horizontal-rule-delimited section.
Why no blockquote / no raw HTML / no XML?
-----------------------------------------
We tried each in turn and each failed in the document viewer:
- **Raw HTML / XML** (``<image>...</image>``): unknown elements
have no render rules in Streamdown or PlateJS, so the content
survives in the markdown source but is invisible to humans.
- **Blockquote with nested blocks**: nested fenced code blocks,
bullet lists, numbered lists, tables -- any *block* element
inside a ``>``-prefixed blockquote -- gets evicted by Streamdown
/ remark, dropping everything after it onto the document level.
The vision LLM happily produces bulleted descriptions, so this
hit the viewer in practice.
A horizontal-rule-delimited section, by contrast, contains only
standard top-level markdown -- bold labels and free-form body --
so the description's native markdown (lists, prose, tables) all
renders natively in every renderer.
Layout (OCR section omitted when ``ocr_text`` is None/empty):
---
**Embedded image:** `MM-130-a.jpeg`
**OCR text:**
Slice 24 / 60
L
R
**Visual description:**
- Axial contrast-enhanced CT showing a large cystic mass...
- Mass effect on the adjacent stomach.
---
Still LLM-friendly: the ``**Embedded image:** `<filename>``` prefix
is unique and trivially regex-able (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).
Returned with leading and trailing blank-line padding so the rules
never merge with adjacent paragraphs after splicing.
"""
parts: list[str] = [f"**Embedded image:** `{name}`"]
if ocr_text and ocr_text.strip():
# Bold "OCR text:" label with trailing two spaces (=> <br>) so
# the first OCR line sits directly under the label rather than
# forcing a paragraph break that some renderers would style
# differently. Subsequent OCR lines also use trailing two spaces
# for hard breaks, so multi-line OCR renders line-by-line
# without needing a (fragile) fenced code block.
ocr_clean_lines = [
ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
]
parts.append("")
parts.append("**OCR text:** ")
for i, raw in enumerate(ocr_clean_lines):
suffix = "" if i == len(ocr_clean_lines) - 1 else " "
parts.append(f"{raw}{suffix}")
parts.append("")
parts.append("**Visual description:**")
parts.append("")
parts.append(description.strip())
body = "\n".join(parts)
# Wrap with blank lines + horizontal rules so the block is clearly
# delimited from surrounding paragraphs and survives splicing into
# the middle of any markdown stream.
return "\n\n---\n\n" + body + "\n\n---\n\n"
# Patterns we'll try to splice into. Each pattern captures the
# original-PDF filename when one is available (group 1).
#
# Replace-style markers (the matched span is substituted with our block
# because it carries no useful content of its own):
#
# 1. Docling's image placeholder followed by an "Image: <filename>"
#    caption line. This is what our medxpertqa renderer produces:
#    reportlab places the JPEG, then a caption, and Docling outputs
#    the placeholder + caption.
# 2. Docling's image placeholder alone (filename unknown -- we fall
#    back to pypdf's name).
# 3. A bare "Image: <filename>" caption line with no preceding
#    placeholder. Rare in practice, but covers parsers that drop the
#    placeholder entirely.

# Marker 1. Group 1 is the caption's filename (first whitespace-free
# token after "Image:"). The match also swallows the newline that ends
# the caption line, when there is one.
_PLACEHOLDER_WITH_CAPTION = re.compile(
    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
    re.IGNORECASE,
)

# Marker 2. No capture group, so the caller has no filename from the
# document. Note it also matches the placeholder part of marker 1.
_PLACEHOLDER_ONLY = re.compile(
    r"<!--\s*image\s*-->",
    re.IGNORECASE,
)

# Marker 3. Anchored to a whole line via MULTILINE; only spaces/tabs may
# precede "Image:". Because ``\s`` also matches newlines, the trailing
# ``\s*`` can extend the match over blank lines that follow the caption.
_CAPTION_ONLY = re.compile(
    r"^[ \t]*Image:\s*(\S+)\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# Append-after marker (the matched span is preserved verbatim and our
# block is inserted immediately after it):
#
# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
#    Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
#    The figure's own contents -- chart bar values, axis labels,
#    inline ``<figcaption>``, embedded ``<table>`` for tabular figures
#    -- are themselves specialist OCR output, so we keep them and add
#    our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
#    optional attributes like ``<figure id="...">``; ``re.DOTALL``
#    lets ``.`` cross the newlines inside the block. The body is matched
#    non-greedily, so each match ends at the FIRST ``</figure>`` after
#    its opening tag.
_FIGURE_BLOCK = re.compile(
    r"<figure\b[^>]*>.*?</figure>",
    re.DOTALL | re.IGNORECASE,
)
def _replace_one_match(
markdown: str,
pattern: re.Pattern[str],
descriptions: list[PictureDescription],
desc_idx: int,
) -> tuple[str, int]:
"""Replace the first occurrence of ``pattern`` with the next image block.
Returns the new markdown and the new ``desc_idx`` (advanced if a
replacement happened, unchanged otherwise).
"""
if desc_idx >= len(descriptions):
return markdown, desc_idx
match = pattern.search(markdown)
if not match:
return markdown, desc_idx
desc = descriptions[desc_idx]
captured_name: str | None = None
if match.groups():
captured_name = match.group(1)
name = captured_name or desc.name
block = _format_image_block(name, desc.description, desc.ocr_text)
new_markdown = markdown[: match.start()] + block + markdown[match.end():]
return new_markdown, desc_idx + 1
def _splice_after_figures(
markdown: str,
descriptions: list[PictureDescription],
desc_idx: int,
) -> tuple[str, int]:
"""Append vision-LLM blocks immediately after each ``<figure>...</figure>``.
Layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
premium) wrap each figure / chart / inline table in this tag and
carry their own OCR of the figure's text content inside it. That
content is useful on its own, so we keep the original block
verbatim and add our vision-LLM block right after it -- giving
retrieval both signals in the same chunk.
Descriptions are matched to figures in document order (first
description -> first figure, etc.). All splice points are computed
upfront with :func:`re.finditer` and applied in REVERSE order so
earlier offsets stay valid as the markdown grows. Returns the
advanced ``desc_idx`` for the caller's leftover-handling.
"""
if desc_idx >= len(descriptions):
return markdown, desc_idx
matches = list(_FIGURE_BLOCK.finditer(markdown))
if not matches:
return markdown, desc_idx
n_to_splice = min(len(matches), len(descriptions) - desc_idx)
if n_to_splice <= 0:
return markdown, desc_idx
out = markdown
# Walk in reverse so each splice's end-offset still points at the
# right place in the (still-mutating) string.
for i in range(n_to_splice - 1, -1, -1):
match = matches[i]
desc = descriptions[desc_idx + i]
block = _format_image_block(desc.name, desc.description, desc.ocr_text)
out = out[: match.end()] + block + out[match.end():]
return out, desc_idx + n_to_splice
def inject_descriptions_inline(
    markdown: str,
    result: PictureExtractionResult,
) -> tuple[str, int]:
    """Splice per-image markdown blocks into the document at image positions.

    Descriptions are consumed in order by two strategies:

    1. **Append-after** for ``<figure>...</figure>`` blocks emitted by
       layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
       premium). The figure is preserved and the block is added right
       after it.
    2. **Replace** for Docling-style markers: ``<!-- image -->`` followed
       by an ``Image: <filename>`` caption, ``<!-- image -->`` alone, or
       a bare ``Image: <filename>`` caption line.

    Within step 2 the marker that appears EARLIEST in the document is
    replaced next, whatever its style, so the N-th remaining description
    lands on the N-th remaining marker even when captioned and
    un-captioned placeholders are mixed. When two patterns match at the
    same offset (a captioned placeholder also matches the bare
    placeholder pattern), the more specific pattern wins.

    A document typically uses only figures or only Docling markers. When
    both occur, figures are consumed first.

    Args:
        markdown: Parser output to splice into.
        result: Extraction result whose ``descriptions`` are placed.

    Returns:
        ``(new_markdown, n_inlined)``, where ``n_inlined`` is the number of
        descriptions placed inline. The caller decides what to do with
        the leftover descriptions (typically: append them at the end).
    """
    if not result.descriptions:
        return markdown, 0

    descriptions = result.descriptions

    # Step 1: layout-aware figures, handled as one batch in document order.
    out, desc_idx = _splice_after_figures(markdown, descriptions, 0)

    # Step 2: Docling-style replacement markers, one per iteration. The
    # tuple is ordered most-specific first; that order is the tie-break
    # for matches starting at the same offset.
    replace_patterns = (
        _PLACEHOLDER_WITH_CAPTION,
        _PLACEHOLDER_ONLY,
        _CAPTION_ONLY,
    )
    while desc_idx < len(descriptions):
        best_pattern = None
        best_start = -1
        for pattern in replace_patterns:
            found = pattern.search(out)
            if found is None:
                continue
            # Strict "<" keeps the earlier (more specific) pattern on ties.
            if best_pattern is None or found.start() < best_start:
                best_pattern = pattern
                best_start = found.start()
        if best_pattern is None:
            # No more positions to splice into.
            break
        # _replace_one_match replaces the pattern's first hit, which is
        # exactly the match selected above.
        out, desc_idx = _replace_one_match(
            out, best_pattern, descriptions, desc_idx
        )
    return out, desc_idx
def render_appended_section(
    descriptions: list[PictureDescription],
    *,
    skip_notes: PictureExtractionResult | None = None,
    heading: str = "## Image Content (vision-LLM extracted)",
) -> str:
    """Build the trailing section that holds descriptions not placed inline.

    The section consists of ``heading`` followed by one image block per
    description. When ``skip_notes`` is supplied, a single italic note
    line lists its non-zero skip/failure counters. Returns an empty
    string when there are no descriptions and no ``skip_notes``.
    """
    if not descriptions and not skip_notes:
        return ""

    lines: list[str] = ["", heading, ""]
    for item in descriptions:
        lines.extend(
            (
                _format_image_block(item.name, item.description, item.ocr_text),
                "",
            )
        )

    if skip_notes is not None:
        labelled_counts = (
            (skip_notes.skipped_too_large, "too large (> 5 MB)"),
            (skip_notes.skipped_too_small, "too small (< 1 KB)"),
            (skip_notes.skipped_duplicate, "duplicate"),
            (skip_notes.failed, "failed"),
        )
        # Zero counters are left out of the note entirely.
        notes = [f"{count} {label}" for count, label in labelled_counts if count]
        if notes:
            lines.append(f"_Note: {', '.join(notes)} image(s) skipped._")

    return "\n".join(lines)
def merge_descriptions_into_markdown(
    markdown: str,
    result: PictureExtractionResult,
) -> str:
    """Entry point used by the ETL pipeline: inline first, append the rest.

    Descriptions that find a marker in the document are spliced in at
    that position. Any that remain are collected under a trailing
    heading, so every described image ends up somewhere in the output.
    With no descriptions the markdown is returned untouched.
    """
    if not result.descriptions:
        return markdown

    spliced, placed = inject_descriptions_inline(markdown, result)
    remaining = result.descriptions[placed:]
    if not remaining:
        return spliced

    # The heading tells the reader whether nothing could be located
    # inline at all, or only some images lacked a marker.
    if placed == 0:
        heading = "## Image Content (vision-LLM extracted)"
    else:
        heading = "## Image Content (additional, no inline marker found)"

    section = render_appended_section(remaining, heading=heading)
    if not section:
        return spliced
    return spliced.rstrip() + "\n\n" + section.lstrip() + "\n"
# Public surface of this module. Underscore-prefixed helpers and the
# splice regexes are deliberately not exported.
__all__ = [
    "PictureDescription",
    "PictureExtractionResult",
    "describe_pictures",
    "inject_descriptions_inline",
    "merge_descriptions_into_markdown",
    "render_appended_section",
]

View file

@ -77,10 +77,16 @@ class DoclingService:
# Create pipeline options with version-safe attribute checking
pipeline_options = PdfPipelineOptions()
# Disable OCR (user request)
# Enable OCR so text-in-image (chart axes, ECG annotations,
# lab tables embedded as images, scanned pages, etc.) is
# lifted into the main markdown stream. This pairs with the
# vision-LLM picture-description pass downstream — OCR
# captures literal text; vision LLM captures the visual
# content. Together they give a faithful representation of
# PDFs that mix text and images.
if hasattr(pipeline_options, "do_ocr"):
pipeline_options.do_ocr = False
logger.info("⚠️ OCR disabled by user request")
pipeline_options.do_ocr = True
logger.info("✅ OCR enabled for embedded text-in-image extraction")
else:
logger.warning("⚠️ OCR attribute not available in this Docling version")

View file

@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.file_classifier import (
FileCategory,
classify_file as etl_classify,
)
await _notify(ctx, "parsing", "Processing file")
await ctx.task_logger.log_task_progress(
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
{"processing_stage": "extracting"},
)
# Fetch the vision LLM whenever the operator opts in. The ETL
# pipeline decides what to do with it: image files run through the
# vision LLM directly; document files (PDFs) get per-image
# descriptions appended via picture_describer.
vision_llm = None
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
if ctx.use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
await _notify(ctx, "parsing", "Extracting content")
etl_result = await EtlPipelineService().extract(
# Document files (PDF, docx, etc.) get vision LLM treatment too:
# the ETL pipeline appends a per-image description section when
# vision_llm is provided. See picture_describer.describe_pictures.
vision_llm = None
if ctx.use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(
file_path=ctx.file_path,
filename=ctx.filename,
@ -418,8 +427,12 @@ async def _extract_file_content(
billable_pages = estimated_pages * mode.page_multiplier
await page_limit_service.check_page_limit(user_id, billable_pages)
# Vision LLM is provided to the ETL pipeline for any file category
# when the operator opts in. Image files run through it directly;
# document files (PDFs) get per-image descriptions appended via
# picture_describer.
vision_llm = None
if use_vision_llm and category == FileCategory.IMAGE:
if use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)

View file

@ -0,0 +1,53 @@
#!/bin/bash
# =============================================================================
# E2E entrypoint for the multi-stage Dockerfile's `e2e` target.
#
# Dispatches on SERVICE_ROLE to the test-only entrypoints under tests/e2e/.
# Those scripts apply sys.modules hijacks and LLM/embedding patches BEFORE
# importing production app code (see tests/e2e/run_backend.py for rationale).
#
# Production never sees this file: tests/ is excluded from the production
# stage, and the production stage uses scripts/docker/entrypoint.sh.
# =============================================================================
set -euo pipefail
SERVICE_ROLE="${SERVICE_ROLE:-api}"
echo "[e2e-entrypoint] starting role=${SERVICE_ROLE}"
wait_for_db() {
    # Block until the database accepts a connection, giving up after 60
    # attempts. We don't loop forever — Compose depends_on/healthchecks
    # already gate on db readiness, this is just belt-and-suspenders so a
    # slow first connection doesn't race migrations.
    #
    # The probe opens a real connection and runs `SELECT 1`. Importing the
    # engine and calling engine.dispose() is not a reachability check:
    # dispose() only releases pooled connections and never opens one, so
    # it succeeds even while the database is down.
    #
    # NOTE(review): assumes app.db.engine is a SQLAlchemy AsyncEngine (the
    # previous probe awaited engine.dispose() via asyncio.run) — confirm.
    #
    # Returns 0 once the probe succeeds, 1 after the last failed attempt.
    local attempt
    for attempt in {1..60}; do
        echo "[e2e-entrypoint] db check attempt ${attempt}/60"
        if python -c '
import asyncio

from app.db import engine


async def probe():
    try:
        async with engine.connect() as conn:
            await conn.exec_driver_sql("SELECT 1")
    finally:
        # Do not leave pooled connections behind in this throwaway process.
        await engine.dispose()


asyncio.run(probe())
'; then
            echo "[e2e-entrypoint] db reachable after ${attempt} attempts"
            return 0
        fi
        sleep 1
    done
    echo "[e2e-entrypoint] ERROR: db not reachable after 60s" >&2
    return 1
}
# Dispatch on SERVICE_ROLE: each known role waits for the database and
# then replaces this shell with its Python entrypoint.
if [[ "${SERVICE_ROLE}" == "api" ]]; then
    wait_for_db
    echo "[e2e-entrypoint] running alembic upgrade head"
    alembic upgrade head
    # `exec` hands the process over to Python, so the SIGTERM sent by
    # `docker stop` is delivered to it directly rather than to a shell.
    exec python tests/e2e/run_backend.py
elif [[ "${SERVICE_ROLE}" == "worker" ]]; then
    # Migrations are the api role's job and run there exactly once. The
    # worker still waits for the db so its startup does not race an
    # unready Postgres on cold start.
    wait_for_db
    exec python tests/e2e/run_celery.py
else
    echo "[e2e-entrypoint] ERROR: unknown SERVICE_ROLE='${SERVICE_ROLE}' (expected: api | worker)" >&2
    exit 1
fi

View file

@ -1,48 +1,48 @@
# Backend E2E Test Harness
# Backend E2E Harness
Strict fakes + alternative entrypoints used **only** by Playwright E2E.
Excluded from the production Docker image via `.dockerignore`.
This directory contains the test-only backend entrypoints and fakes used by
Playwright. They are not part of the production image: `.dockerignore` excludes
`tests/`, and the E2E Docker stage copies this directory through a separate
build context.
## Files
| Path | Role |
| -------------------------------- | ------------------------------------------------------------------------------- |
| `run_backend.py` | FastAPI entrypoint that hijacks `sys.modules` before importing `app.app:app` |
| `run_celery.py` | Celery worker entrypoint with the same hijack + patch logic |
| `middleware/scenario.py` | `X-E2E-Scenario` header → ContextVar (read by fakes) |
| `fakes/composio_module.py` | Strict drop-in for the `composio` package; raises on unknown surface |
| `fakes/llm.py` | `fake_get_user_long_context_llm` returning a `FakeListChatModel` |
| `fakes/embeddings.py` | Deterministic 0.1-vector `embed_text` / `embed_texts` |
| `fakes/fixtures/drive_files.json`| Canned Drive listings + file contents (incl. canary tokens) |
| Path | Purpose |
| --- | --- |
| `run_backend.py` | Starts FastAPI after installing the test fakes into `sys.modules`. |
| `run_celery.py` | Starts the Celery worker with the same fake setup. |
| `middleware/scenario.py` | Reads `X-E2E-Scenario` into a request-scoped context var. |
| `fakes/composio_module.py` | Fake `composio` package used by connector flows. |
| `fakes/llm.py` | Fake chat model factory. |
| `fakes/embeddings.py` | Deterministic embedding helpers. |
| `fakes/fixtures/drive_files.json` | Drive fixture data and canary file contents. |
## Why a sys.modules hijack?
## Why the import hook exists
Production code does `from composio import Composio` at module load
time. By the time the FastAPI app object exists, that binding has
already been resolved. The hijack runs **before** any `app.*` import,
so the binding resolves to our strict fake. No production source
changes; fakes are physically excluded from production images.
Some production modules import SDK clients at module load time, for example
`from composio import Composio`. By the time `app.app` has been imported, those
bindings are already fixed.
Belt + suspenders + no internet: the strict `__getattr__` in every
fake raises `NotImplementedError` if a future production code path
introduces a new SDK call. CI also sets `HTTPS_PROXY=http://127.0.0.1:1`
plus sentinel API keys so any leaked outbound HTTP fails immediately.
The E2E entrypoints install fake modules in `sys.modules` before importing any
`app.*` module. That lets the normal production code run while SDK calls resolve
to local fakes.
## Adding a new fake
The fakes should fail loudly. If production starts using a new SDK method that
the fake does not implement, add that method to the fake instead of letting the
test call the real service.
1. Create `fakes/<sdk>_module.py` modelled on `composio_module.py`.
2. In `run_backend.py` and `run_celery.py`, register
`sys.modules["<sdk>"] = _fake_<sdk>` before the `from app.app import app`
line.
3. If the new fake needs scenario branching, read from
## Adding a fake
1. Add `fakes/<sdk>_module.py`.
2. Register it in both `run_backend.py` and `run_celery.py` before importing
`app.app` or `app.celery_app`.
3. If the fake needs per-test behavior, read the current scenario from
`tests.e2e.middleware.scenario.current_scenario()`.
## Reused by backend integration tests
## Shared with backend integration tests
The strict fakes are not only for Playwright. Backend route integration
tests can import the same fake before importing `app.app`, so Composio
route tests exercise production route code without touching the real
SDK:
Backend integration tests can use the same fakes when they need production route
code without the real SDK:
```python
from tests.e2e.fakes import composio_module as _fake_composio
@ -50,20 +50,93 @@ sys.modules["composio"] = _fake_composio
from app.app import app
```
See `surfsense_backend/tests/integration/composio/conftest.py` for the
current pattern.
See `surfsense_backend/tests/integration/composio/conftest.py` for the current
pattern.
## Running locally
The recommended local flow runs only Postgres and Redis in Docker, and the
backend + Celery worker on the host. No `.env` file is required: both
entrypoints `setdefault` every variable they need (DB URL, Redis URL,
sentinel API keys, etc.) to values that match `docker-compose.deps-only.yml`.
### One-time setup
From `surfsense_web/`:
```bash
cd surfsense_backend
pnpm install
pnpm exec playwright install --with-deps chromium
```
### Each run
**1. Bring up Postgres + Redis** from the repo root. The other deps-only
services (SearXNG, Zero, pgAdmin) are not needed for E2E:
```bash
docker compose -f docker/docker-compose.deps-only.yml up -d db redis
```
**2. Start the backend** in `surfsense_backend/`, terminal A:
```bash
uv sync
uv run alembic upgrade head
uv run python tests/e2e/run_backend.py
# in a second shell:
```
**3. Start the Celery worker** in `surfsense_backend/`, terminal B:
```bash
uv run python tests/e2e/run_celery.py
```
Then in `surfsense_web`:
**4. Register the Playwright user**:
```bash
pnpm test:e2e
curl -X POST http://localhost:8000/auth/register \
-H "Content-Type: application/json" \
-d '{"email":"e2e-test@surfsense.net","password":"E2eTestPassword123!"}'
```
**5. Run Playwright** from `surfsense_web/`, terminal C:
```bash
pnpm test:e2e # dev server (fast iteration)
pnpm test:e2e:headed # show the browser
pnpm test:e2e:ui # Playwright UI mode
pnpm test:e2e:prod # build + start (matches CI exactly)
```
`playwright.config.ts` and the run scripts share defaults, so this works on a
fresh checkout. Set `PLAYWRIGHT_TEST_EMAIL`, `PLAYWRIGHT_TEST_PASSWORD`,
`NEXT_PUBLIC_FASTAPI_BACKEND_URL`, or any backend env (e.g. `DATABASE_URL`)
only when pointing tests at a different stack.
### Cleanup
```bash
docker compose -f docker/docker-compose.deps-only.yml down
```
Add `-v` to also wipe the Postgres volume.
### Hermetic alternative (matches CI)
To reproduce the CI environment exactly — backend and Celery in containers,
network egress denied at L3 — replace steps 1–3 with:
```bash
docker compose -f docker/docker-compose.e2e.yml up -d --build --wait
```
Then run steps 4 (curl register) and 5 (`pnpm test:e2e:prod`) as above. Tear
down with:
```bash
docker compose -f docker/docker-compose.e2e.yml down -v --remove-orphans
```
This builds the ~9 GB `surfsense-e2e-backend:local` image, so the deps-only
flow above is faster for day-to-day development.

View file

@ -0,0 +1,66 @@
"""Test-only token mint endpoint for the E2E backend entrypoint.
Mounted by ``tests/e2e/run_backend.py`` so Playwright can authenticate
the seeded e2e user without hitting ``/auth/jwt/login`` (rate-limited
to 5/min/IP in production). NEVER ships to production: this whole
``tests/`` tree is excluded from the production Docker image by
``surfsense_backend/.dockerignore``.
Authn: shared secret in ``X-E2E-Mint-Secret``. Same value is set on the
backend container env (``docker/docker-compose.e2e.yml``) and exported
to the Playwright runner (``.github/workflows/e2e-tests.yml``).
"""
from __future__ import annotations
import logging
import os
from fastapi import APIRouter, FastAPI, Header, HTTPException
from pydantic import BaseModel
from sqlalchemy import select
from app.db import User, async_session_maker
from app.users import get_jwt_strategy
_logger = logging.getLogger("surfsense.e2e.auth_mint")
class MintRequest(BaseModel):
    # Request body for POST /__e2e__/auth/token.
    # Email of the user to mint a token for; when omitted, the default e2e
    # test account address is used.
    email: str = "e2e-test@surfsense.net"
class MintResponse(BaseModel):
    # Response body for POST /__e2e__/auth/token.
    # Token string produced by the JWT strategy's write_token().
    access_token: str
    # Always "bearer" unless a caller overrides it.
    token_type: str = "bearer"
def _expected_secret() -> str:
return os.environ.get("E2E_MINT_SECRET", "local-e2e-mint-secret-not-for-production")
router = APIRouter(prefix="/__e2e__", tags=["__e2e__"])
@router.post("/auth/token", response_model=MintResponse)
async def mint_test_token(
    body: MintRequest,
    x_e2e_mint_secret: str = Header(..., alias="X-E2E-Mint-Secret"),
) -> MintResponse:
    """Mint a JWT for an already-registered user, gated by a shared secret.

    Args:
        body: Request payload carrying the email of the user to look up.
        x_e2e_mint_secret: Value of the ``X-E2E-Mint-Secret`` header; must
            equal the value returned by ``_expected_secret()``.

    Returns:
        A ``MintResponse`` whose ``access_token`` was written by the JWT
        strategy for the matched user.

    Raises:
        HTTPException: 403 when the secret does not match, 404 when no user
            with the requested email exists.
    """
    # Local import keeps this block self-contained.
    import hmac

    # Compare as bytes in constant time. ``compare_digest`` rejects non-ASCII
    # ``str`` arguments, so both sides are encoded first; a plain ``!=`` would
    # leak how many leading characters matched through response timing.
    supplied = x_e2e_mint_secret.encode("utf-8")
    expected = _expected_secret().encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=403, detail="invalid e2e mint secret")

    async with async_session_maker() as session:
        result = await session.execute(select(User).where(User.email == body.email))
        user = result.scalar_one_or_none()
        if user is None:
            raise HTTPException(
                status_code=404, detail=f"e2e user {body.email!r} not seeded"
            )
        # Written while the session is still open so the user row is attached
        # when the strategy reads it.
        token = await get_jwt_strategy().write_token(user)
    return MintResponse(access_token=token)
def install(app: FastAPI) -> None:
    """Mount the test-only mint router onto the given FastAPI app.

    Args:
        app: The application that should expose ``POST /__e2e__/auth/token``.
    """
    app.include_router(router)
    # Logged at WARNING so the presence of this test-only route stands out in
    # the server log.
    _logger.warning("[e2e] mounted POST /__e2e__/auth/token (test-only token mint)")

View file

@ -0,0 +1,141 @@
"""Stub DoclingService.process_document for E2E.
The real ``DoclingService.process_document`` calls
``DocumentConverter.convert(file_path)`` which lazily downloads the
``docling-project/docling-layout-heron`` model from Hugging Face Hub.
The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
``docker/docker-compose.e2e.yml``), so that download fails with
``LocalEntryNotFoundError`` and the indexing Celery task retries until
the Playwright test hits its ~4-minute step timeout. In CI that is the
difference between the suite finishing and the 30-minute job timeout
killing the run before any report can upload.
Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
entirely. ``DoclingService.__init__`` is intentionally left untouched
because constructing ``DocumentConverter(...)`` is cheap and offline —
it is only ``.convert()`` that triggers the offline-model download.
Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
by ``generate_canary_pdfs.py`` and embeds its canary token as plain
``(text) Tj`` PDF text operators. Extracting those operators gives us
the canary string back, which is what the Playwright assertions look
for in the resulting Document row.
"""
from __future__ import annotations
import logging
import re
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# Matches the `(escaped text) Tj` text-show operator emitted by
# generate_canary_pdfs.py. Inside the parens, the escape rules are:
# \\ -> backslash
# \( -> literal (
# \) -> literal )
# The character class [^\\()] consumes any non-escape byte; \\. consumes
# an escape sequence. Sufficient for our synthetic fixtures.
_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
def _extract_text_from_synthetic_pdf(file_path: str) -> str:
"""Pull every ``(text) Tj`` payload out of a fixture PDF in order.
Returns an empty string if the file cannot be read. We do not try to
handle arbitrary PDFs because the fake is only ever invoked against
fixtures we generate ourselves.
"""
try:
data = Path(file_path).read_bytes()
except OSError as exc:
logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
return ""
lines: list[str] = []
for match in _TJ_PATTERN.finditer(data):
raw = match.group(1)
# Order-sensitive unescape via sentinel: protect `\\` first so
# the subsequent `\(` / `\)` passes do not corrupt it.
text = (
raw.replace(rb"\\", b"\x00")
.replace(rb"\(", b"(")
.replace(rb"\)", b")")
.replace(b"\x00", b"\\")
)
try:
lines.append(text.decode("utf-8"))
except UnicodeDecodeError:
lines.append(text.decode("latin-1"))
return "\n".join(lines)
async def fake_process_document(
    self,
    file_path: str,
    filename: str | None = None,
) -> dict[str, Any]:
    """Stand-in for ``DoclingService.process_document`` used by E2E runs.

    Args:
        self: The service instance the method is bound to; unused.
        file_path: Path of the fixture PDF to read.
        filename: Optional display name; defaults to the basename of
            ``file_path``.

    Returns:
        A dict with the keys ``content``, ``full_text``, ``service_used``,
        ``status`` and ``processing_notes``. ``content`` and ``full_text``
        hold the same Markdown string.
    """
    display_name = filename if filename else Path(file_path).name
    extracted = _extract_text_from_synthetic_pdf(file_path)

    # When nothing was extracted we still return a document rather than
    # raising: a failing canary assertion is a much clearer failure mode
    # than a hard parser exception inside the indexing pipeline.
    body = (
        extracted
        if extracted
        else "(empty docling fake — no text-show operators found)"
    )
    content = f"# {display_name}\n\n{body}\n"

    logger.info(
        "[fake-docling] returning %d chars for %s",
        len(content),
        display_name,
    )

    result: dict[str, Any] = {
        "content": content,
        "full_text": content,
        "service_used": "docling-fake",
        "status": "success",
        "processing_notes": "e2e fake DoclingService — no real PDF parsing",
    }
    return result
def install(patches: list[Any]) -> None:
    """Patch ``DoclingService.process_document`` at the class level.

    Patching the class method (rather than each call site) is correct
    here because every consumer goes through
    ``create_docling_service()`` → ``DoclingService()`` → instance method
    dispatch, so the descriptor protocol picks up our replacement. There
    is exactly one such consumer today
    (``app/etl_pipeline/parsers/docling.py``), but patching the class is
    future-proof.

    Fails loud rather than warning, because a silent passthrough means
    real Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.

    Args:
        patches: Caller-owned list; the started patcher is appended to it.

    Raises:
        RuntimeError: If the target module or attribute cannot be resolved.
    """
    from unittest.mock import patch as _patch

    target = "app.services.docling_service.DoclingService.process_document"
    try:
        p = _patch(target, fake_process_document)
        # start() is where the dotted target is resolved, so a stale path
        # surfaces here.
        p.start()
        patches.append(p)
        logger.info("[fake-docling] patched %s", target)
    except (ModuleNotFoundError, AttributeError) as exc:
        raise RuntimeError(
            f"Could not patch Docling binding {target!r}: {exc!s}. "
            f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
            f"to point at the new binding site."
        ) from exc

View file

@ -0,0 +1,71 @@
# Synthetic Global LLM configuration for E2E ONLY.
#
# Why this file exists:
# surfsense_backend/app/config/global_llm_config.yaml is gitignored
# (operators ship real API keys there). In CI that file does not exist,
# so app.config.load_global_llm_configs() returns [], every chat-stream
# test fails fast with "No usable global LLM configs are available for
# Auto mode" raised by auto_model_pin_service._global_candidates().
#
# What this file does:
# tests/e2e/run_backend.py and tests/e2e/run_celery.py copy this file
# to app/config/global_llm_config.yaml at startup, BEFORE app.config
# is imported. The copy lives only inside the E2E Docker container.
#
# Why a fake api_key is safe:
# tests.e2e.fakes.chat_llm patches
# app.tasks.chat.stream_new_chat.create_chat_litellm_from_agent_config
# app.tasks.chat.stream_new_chat.create_chat_litellm_from_config
# so the resolved auto-pin id is never sent to a real LLM provider.
# The values below only need to pass
# auto_model_pin_service._is_usable_global_config()
# which requires id / model_name / provider / api_key all truthy.
#
# Why TWO entries (premium + free):
# auto_model_pin_service.resolve_or_get_pinned_llm_config_id() splits
# candidates by billing_tier based on _is_premium_eligible(user):
# premium_eligible == True -> keeps only tier=="premium" configs
# premium_eligible == False -> keeps only tier!="premium" configs
# A single-tier fixture would fail one of the two branches with
# "Auto mode could not find an eligible LLM config for this user and
# quota state". Shipping one of each guarantees every quota state
# resolves to a viable pin in E2E.
# Router behaviour for the fake configs below.
# NOTE(review): these keys look like LiteLLM Router options — confirm against
# the loader in app.config before relying on their exact semantics.
router_settings:
  routing_strategy: "simple-shuffle"
  # No retries: the chat factories are patched in E2E, so a retry would only
  # slow a failing test down.
  num_retries: 0
  allowed_fails: 1
  cooldown_time: 1
# One entry per billing tier, so both the premium-eligible and the
# non-premium branch of Auto-mode pin resolution find a candidate.
global_llm_configs:
  # Premium-tier candidate.
  - id: -9001
    name: "E2E Fake Auto Model (premium)"
    billing_tier: "premium"
    anonymous_enabled: false
    seo_enabled: false
    quality_score: 1.0
    provider: "OPENAI"
    model_name: "fake-e2e-model-premium"
    # Only needs to be truthy; the patched chat factories never send it to a
    # real provider.
    api_key: "fake-e2e-api-key-not-for-production"
    supports_image_input: false
    quota_reserve_tokens: 1024
    rpm: 1000
    tpm: 100000
    litellm_params:
      model: "openai/fake-e2e-model-premium"
  # Non-premium candidate.
  - id: -9002
    name: "E2E Fake Auto Model (free)"
    billing_tier: "free"
    anonymous_enabled: false
    seo_enabled: false
    quality_score: 1.0
    provider: "OPENAI"
    model_name: "fake-e2e-model-free"
    # Same placeholder key as the premium entry.
    api_key: "fake-e2e-api-key-not-for-production"
    supports_image_input: false
    quota_reserve_tokens: 1024
    rpm: 1000
    tpm: 100000
    litellm_params:
      model: "openai/fake-e2e-model-free"

View file

@ -23,15 +23,12 @@ Usage:
from __future__ import annotations
import asyncio
import logging
import os
import sys
# ---------------------------------------------------------------------------
# 1) Hijack sys.modules BEFORE any production import.
# Production: composio_service.py:11 does `from composio import Composio`.
# With this hijack in place, that import resolves to our strict fake.
# ---------------------------------------------------------------------------
import uvicorn
# Make the surfsense_backend root importable as a top-level package so
# `import tests.e2e.fakes...` works regardless of how the entrypoint is
@ -42,97 +39,175 @@ _BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
if _BACKEND_ROOT not in sys.path:
sys.path.insert(0, _BACKEND_ROOT)
import tests.e2e.fakes.composio_module as _fake_composio # noqa: E402
import tests.e2e.fakes.notion_module as _fake_notion # noqa: E402
sys.modules["composio"] = _fake_composio
sys.modules["notion_client"] = _fake_notion
sys.modules["notion_client.errors"] = _fake_notion.errors
# ---------------------------------------------------------------------------
# 2) Standard logging + dotenv so the rest of the app behaves like main.py.
# ---------------------------------------------------------------------------
from dotenv import load_dotenv # noqa: E402
load_dotenv()
os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
os.environ.setdefault(
"CONFLUENCE_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/confluence/connector/callback",
)
os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
os.environ.setdefault(
"NOTION_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/notion/connector/callback",
)
os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
os.environ.setdefault(
"ONEDRIVE_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/onedrive/connector/callback",
)
os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
os.environ.setdefault(
"DROPBOX_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/dropbox/connector/callback",
)
os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("surfsense.e2e.backend")
logger.warning(
"*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
)
# ---------------------------------------------------------------------------
# 3) Now import the production app. Every module in app.* loads here,
# creating their bindings (some of which we will patch in step 4).
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# 4) Patch LLM + embedding bindings at every consumer site.
# Composio is already covered by the sys.modules hijack in step 1.
# ---------------------------------------------------------------------------
from unittest.mock import patch # noqa: E402
from app.app import app # noqa: E402
from tests.e2e.fakes import ( # noqa: E402
clickup_module as _fake_clickup_module,
confluence_indexer as _fake_confluence_indexer,
confluence_oauth as _fake_confluence_oauth,
dropbox_api as _fake_dropbox_api,
embeddings as _fake_embeddings,
jira_module as _fake_jira_module,
linear_module as _fake_linear_module,
mcp_oauth_runtime as _fake_mcp_oauth_runtime,
mcp_runtime as _fake_mcp_runtime,
native_google as _fake_native_google,
notion_module as _fake_notion_module,
onedrive_graph as _fake_onedrive_graph,
slack_module as _fake_slack_module,
)
from tests.e2e.fakes.chat_llm import ( # noqa: E402
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm # noqa: E402
# Patches started during bootstrap are kept alive for the lifetime of the
# process. We never call .stop() on them.
_active_patches: list = []
def _hijack_external_sdks() -> None:
    """Register the composio and notion_client fakes in ``sys.modules``.

    Production code runs ``from composio import Composio`` and
    ``import notion_client`` at import time. Once the fakes are registered
    under those names, the imports resolve to our strict fakes instead of
    the real SDKs.

    MUST run before _import_production_app().
    """
    from tests.e2e.fakes import composio_module as fake_composio
    from tests.e2e.fakes import notion_module as fake_notion

    sys.modules.update(
        {
            "composio": fake_composio,
            "notion_client": fake_notion,
            "notion_client.errors": fake_notion.errors,
        }
    )
def _load_dotenv_and_set_env_defaults() -> None:
"""Load .env and set every env var the production config reads on import.
MUST run before _import_production_app(), since app.config consumes
these values at import time.
"""
from dotenv import load_dotenv
load_dotenv()
os.environ.setdefault(
"DATABASE_URL",
"postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
)
os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
os.environ.setdefault("REDIS_APP_URL", "redis://localhost:6379/0")
os.environ.setdefault("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
os.environ.setdefault("SECRET_KEY", "local-e2e-secret-not-for-production")
os.environ.setdefault("AUTH_TYPE", "LOCAL")
os.environ.setdefault("REGISTRATION_ENABLED", "TRUE")
os.environ.setdefault("ETL_SERVICE", "DOCLING")
os.environ.setdefault("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
os.environ.setdefault("NEXT_FRONTEND_URL", "http://localhost:3000")
# Sentinel keys — fakes never read them; turns leaked real calls into 401s.
os.environ.setdefault("COMPOSIO_API_KEY", "local-deny-real-call-sentinel")
os.environ.setdefault("COMPOSIO_ENABLED", "TRUE")
os.environ.setdefault("OPENAI_API_KEY", "local-deny-real-call-sentinel")
os.environ.setdefault("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel")
os.environ.setdefault("LITELLM_API_KEY", "local-deny-real-call-sentinel")
os.environ.setdefault("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id")
os.environ.setdefault("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret")
os.environ.setdefault(
"CONFLUENCE_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/confluence/connector/callback",
)
os.environ.setdefault("NOTION_CLIENT_ID", "fake-notion-client-id")
os.environ.setdefault("NOTION_CLIENT_SECRET", "fake-notion-client-secret")
os.environ.setdefault(
"NOTION_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/notion/connector/callback",
)
os.environ.setdefault("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id")
os.environ.setdefault("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret")
os.environ.setdefault(
"ONEDRIVE_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/onedrive/connector/callback",
)
os.environ.setdefault("DROPBOX_APP_KEY", "fake-dropbox-app-key")
os.environ.setdefault("DROPBOX_APP_SECRET", "fake-dropbox-app-secret")
os.environ.setdefault(
"DROPBOX_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/dropbox/connector/callback",
)
# Native Google OAuth — fake Flow in tests.e2e.fakes.native_google
# raises "Fake Google Flow requires redirect_uri." if these are empty,
# so connector/add routes return 500 in CI where no .env supplies them.
os.environ.setdefault(
"GOOGLE_DRIVE_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/google/drive/connector/callback",
)
os.environ.setdefault(
"GOOGLE_GMAIL_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
)
os.environ.setdefault(
"GOOGLE_CALENDAR_REDIRECT_URI",
"http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
)
os.environ["SLACK_CLIENT_ID"] = "fake-slack-mcp-client-id"
os.environ["SLACK_CLIENT_SECRET"] = "fake-slack-mcp-client-secret"
def _install_synthetic_global_llm_config() -> None:
    """Copy the fake ``global_llm_config.yaml`` into ``app/config/`` for E2E.

    The real file is gitignored (production operators ship their own with
    real API keys), so a fresh CI checkout has no YAML at the path
    ``app.config.load_global_llm_configs()`` reads. With an empty
    ``GLOBAL_LLM_CONFIGS`` list, ``auto_model_pin_service`` raises
    ``"No usable global LLM configs are available for Auto mode"`` on
    every chat-stream request.

    The fixture under ``tests/e2e/fixtures/`` is copied to the
    production-expected location BEFORE ``_import_production_app()`` so
    ``app.config`` picks it up on import. Production code is untouched —
    this is purely a test-time scaffold.

    The copy happens only when the destination is missing, so a developer
    running the E2E entrypoint locally keeps their real
    ``global_llm_config.yaml`` intact (the patched
    ``create_chat_litellm_from_*`` factories make the actual model values
    irrelevant either way).

    MUST run before _import_production_app().

    Raises:
        RuntimeError: If the checked-in fixture file is missing.
    """
    import shutil

    fixture_path = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
    target_path = os.path.join(
        _BACKEND_ROOT, "app", "config", "global_llm_config.yaml"
    )

    # A missing fixture is reported even when the destination already exists.
    if not os.path.exists(fixture_path):
        raise RuntimeError(
            f"E2E synthetic global LLM config fixture missing at {fixture_path!r}. "
            f"This file is checked into tests/e2e/fixtures/ — if it has gone "
            f"missing, restore it from VCS before running the E2E entrypoint."
        )

    if not os.path.exists(target_path):
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        shutil.copyfile(fixture_path, target_path)
        logger.info(
            "[e2e-global-llm-config] installed %s -> %s", fixture_path, target_path
        )
    else:
        logger.info(
            "[e2e-global-llm-config] %s already exists; leaving it alone "
            "(local dev config preserved)",
            target_path,
        )
def _import_production_app():
    """Import and return the production FastAPI app.

    Every module under ``app.*`` loads here, creating their bindings.
    The LLM/embedding factories captured at this point will be replaced
    by patches in _patch_llm_bindings() below.

    Returns:
        The ``app`` object exported by ``app.app``.
    """
    # Deliberately a function-scope import: it runs only when this function
    # is called, not when this module is first loaded.
    from app.app import app as production_app

    return production_app
def _patch_llm_bindings() -> None:
"""Replace LLM factories at every known binding site."""
from unittest.mock import patch
from tests.e2e.fakes.chat_llm import (
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
targets = [
"app.services.llm_service.get_user_long_context_llm",
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -190,38 +265,90 @@ def _patch_llm_bindings() -> None:
logger.warning("[fake-chat-llm] could not patch %s: %s.", target, exc)
_patch_llm_bindings()
_fake_embeddings.install(_active_patches)
_fake_confluence_oauth.install(_active_patches)
_fake_confluence_indexer.install(_active_patches)
_fake_native_google.install(_active_patches)
_fake_onedrive_graph.install(_active_patches)
_fake_dropbox_api.install(_active_patches)
_fake_notion_module.install(_active_patches)
_fake_linear_module.install(_active_patches)
_fake_jira_module.install(_active_patches)
_fake_clickup_module.install(_active_patches)
_fake_mcp_runtime.install(_active_patches)
_fake_mcp_oauth_runtime.install(_active_patches)
_fake_slack_module.install(_active_patches)
def _install_runtime_fakes() -> None:
    """Call ``install()`` on every runtime fake, in a fixed order.

    Each fake receives the shared ``_active_patches`` list.
    """
    from tests.e2e.fakes import (
        clickup_module,
        confluence_indexer,
        confluence_oauth,
        docling_service,
        dropbox_api,
        embeddings,
        jira_module,
        linear_module,
        mcp_oauth_runtime,
        mcp_runtime,
        native_google,
        notion_module,
        onedrive_graph,
        slack_module,
    )

    # The sequence mirrors the original call order exactly; do not sort it.
    install_order = (
        embeddings,
        docling_service,
        confluence_oauth,
        confluence_indexer,
        native_google,
        onedrive_graph,
        dropbox_api,
        notion_module,
        linear_module,
        jira_module,
        clickup_module,
        mcp_runtime,
        mcp_oauth_runtime,
        slack_module,
    )
    for fake in install_order:
        fake.install(_active_patches)
# ---------------------------------------------------------------------------
# 5) Mount test-only middleware. Production never reaches this code.
# ---------------------------------------------------------------------------
def _install_test_only_app_extensions(app) -> None:
"""Mount test-only middleware + the /__e2e__ token mint router.
from tests.e2e.middleware.scenario import ScenarioMiddleware # noqa: E402
POST /__e2e__/auth/token bypasses /auth/jwt/login's 5/min/IP rate
limit so Playwright workers can authenticate without thrashing the
production auth surface. See tests/e2e/auth_mint.py.
"""
from tests.e2e.auth_mint import install as install_e2e_mint
from tests.e2e.middleware.scenario import ScenarioMiddleware
app.add_middleware(ScenarioMiddleware)
app.add_middleware(ScenarioMiddleware)
install_e2e_mint(app)
# ---------------------------------------------------------------------------
# 6) Start uvicorn, mirroring main.py's behaviour.
# ---------------------------------------------------------------------------
def _bootstrap():
"""Run the full E2E bootstrap and return the production FastAPI app.
import asyncio # noqa: E402
Ordering is load-bearing:
1) Hijack composio + notion_client in sys.modules.
2) Load .env + set env defaults (app.config reads env on import).
3) Configure logging.
4) Materialise the synthetic global_llm_config.yaml so Auto-mode
pin resolution finds at least one usable candidate.
5) Import production app (which transitively imports the now-faked
external SDKs and reads the env defaults + YAML).
6) Patch LLM / embedding bindings at every consumer site.
7) Mount test-only middleware + /__e2e__ routes onto the app.
"""
_hijack_external_sdks()
_load_dotenv_and_set_env_defaults()
import uvicorn # noqa: E402
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.warning(
"*** SURFSENSE E2E BACKEND ENTRYPOINT — fake Composio + LLM + embeddings ***"
)
_install_synthetic_global_llm_config()
production_app = _import_production_app()
_patch_llm_bindings()
_install_runtime_fakes()
_install_test_only_app_extensions(production_app)
return production_app
app = _bootstrap()
def _main() -> None:

View file

@ -25,96 +25,166 @@ if _BACKEND_ROOT not in sys.path:
sys.path.insert(0, _BACKEND_ROOT)
# ---------------------------------------------------------------------------
# 1) Hijack sys.modules BEFORE production celery imports anything.
# ---------------------------------------------------------------------------
import tests.e2e.fakes.composio_module as _fake_composio # noqa: E402
import tests.e2e.fakes.notion_module as _fake_notion # noqa: E402
sys.modules["composio"] = _fake_composio
sys.modules["notion_client"] = _fake_notion
sys.modules["notion_client.errors"] = _fake_notion.errors
# ---------------------------------------------------------------------------
# 2) Logging + dotenv.
# ---------------------------------------------------------------------------
from dotenv import load_dotenv # noqa: E402
load_dotenv()

# Fake OAuth client settings. ``setdefault`` keeps any value that .env or the
# surrounding environment already provided.
for _env_name, _env_fallback in (
    ("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id"),
    ("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret"),
    (
        "CONFLUENCE_REDIRECT_URI",
        "http://localhost:8000/api/v1/auth/confluence/connector/callback",
    ),
    ("NOTION_CLIENT_ID", "fake-notion-client-id"),
    ("NOTION_CLIENT_SECRET", "fake-notion-client-secret"),
    (
        "NOTION_REDIRECT_URI",
        "http://localhost:8000/api/v1/auth/notion/connector/callback",
    ),
    ("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id"),
    ("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret"),
    (
        "ONEDRIVE_REDIRECT_URI",
        "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
    ),
    ("DROPBOX_APP_KEY", "fake-dropbox-app-key"),
    ("DROPBOX_APP_SECRET", "fake-dropbox-app-secret"),
    (
        "DROPBOX_REDIRECT_URI",
        "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
    ),
):
    os.environ.setdefault(_env_name, _env_fallback)
del _env_name, _env_fallback

# The Slack credentials are assigned unconditionally, replacing any value
# that was already set.
os.environ.update(
    {
        "SLACK_CLIENT_ID": "fake-slack-mcp-client-id",
        "SLACK_CLIENT_SECRET": "fake-slack-mcp-client-secret",
    }
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("surfsense.e2e.celery")
logger.warning("*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***")
# ---------------------------------------------------------------------------
# 3) Import the production celery_app. All task modules load here.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# 4) Patch LLM + embedding bindings inside the worker process.
# ---------------------------------------------------------------------------
from unittest.mock import patch # noqa: E402
from app.celery_app import celery_app # noqa: E402
from tests.e2e.fakes import ( # noqa: E402
clickup_module as _fake_clickup_module,
confluence_indexer as _fake_confluence_indexer,
confluence_oauth as _fake_confluence_oauth,
dropbox_api as _fake_dropbox_api,
embeddings as _fake_embeddings,
jira_module as _fake_jira_module,
linear_module as _fake_linear_module,
mcp_oauth_runtime as _fake_mcp_oauth_runtime,
mcp_runtime as _fake_mcp_runtime,
native_google as _fake_native_google,
notion_module as _fake_notion_module,
onedrive_graph as _fake_onedrive_graph,
slack_module as _fake_slack_module,
)
from tests.e2e.fakes.chat_llm import ( # noqa: E402
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm # noqa: E402
# Patches started during bootstrap are kept alive for the lifetime of the
# process. We never call .stop() on them.
_active_patches: list = []
def _hijack_external_sdks() -> None:
    """Register the fake composio and notion_client modules in ``sys.modules``.

    Production code runs ``from composio import Composio`` and
    ``import notion_client`` at import time; once the entries below are in
    place those imports resolve to the strict fakes.

    MUST run before _import_celery_app().
    """
    import tests.e2e.fakes.composio_module as composio_fake
    import tests.e2e.fakes.notion_module as notion_fake

    sys.modules.update(
        {
            "composio": composio_fake,
            "notion_client": notion_fake,
            "notion_client.errors": notion_fake.errors,
        }
    )
def _load_dotenv_and_set_env_defaults() -> None:
    """Load .env, then fill in every env var production config reads on import.

    Values already present (from .env or the process environment) win over
    the fallbacks below; only the Slack credentials are overwritten
    unconditionally.

    MUST run before _import_celery_app(), since app.config consumes
    these values at import time.
    """
    from dotenv import load_dotenv

    load_dotenv()

    fallbacks = (
        (
            "DATABASE_URL",
            "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense",
        ),
        ("CELERY_BROKER_URL", "redis://localhost:6379/0"),
        ("CELERY_RESULT_BACKEND", "redis://localhost:6379/0"),
        ("REDIS_APP_URL", "redis://localhost:6379/0"),
        ("CELERY_TASK_DEFAULT_QUEUE", "surfsense"),
        ("SECRET_KEY", "local-e2e-secret-not-for-production"),
        ("AUTH_TYPE", "LOCAL"),
        ("REGISTRATION_ENABLED", "TRUE"),
        ("ETL_SERVICE", "DOCLING"),
        ("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
        ("NEXT_FRONTEND_URL", "http://localhost:3000"),
        # Sentinel keys — fakes never read them; turns leaked real calls into 401s.
        ("COMPOSIO_API_KEY", "local-deny-real-call-sentinel"),
        ("COMPOSIO_ENABLED", "TRUE"),
        ("OPENAI_API_KEY", "local-deny-real-call-sentinel"),
        ("ANTHROPIC_API_KEY", "local-deny-real-call-sentinel"),
        ("LITELLM_API_KEY", "local-deny-real-call-sentinel"),
        ("ATLASSIAN_CLIENT_ID", "fake-atlassian-client-id"),
        ("ATLASSIAN_CLIENT_SECRET", "fake-atlassian-client-secret"),
        (
            "CONFLUENCE_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/confluence/connector/callback",
        ),
        ("NOTION_CLIENT_ID", "fake-notion-client-id"),
        ("NOTION_CLIENT_SECRET", "fake-notion-client-secret"),
        (
            "NOTION_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/notion/connector/callback",
        ),
        ("MICROSOFT_CLIENT_ID", "fake-microsoft-client-id"),
        ("MICROSOFT_CLIENT_SECRET", "fake-microsoft-client-secret"),
        (
            "ONEDRIVE_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/onedrive/connector/callback",
        ),
        ("DROPBOX_APP_KEY", "fake-dropbox-app-key"),
        ("DROPBOX_APP_SECRET", "fake-dropbox-app-secret"),
        (
            "DROPBOX_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/dropbox/connector/callback",
        ),
        # Native Google OAuth — fake Flow in tests.e2e.fakes.native_google raises
        # "Fake Google Flow requires redirect_uri." when these are empty.
        (
            "GOOGLE_DRIVE_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/google/drive/connector/callback",
        ),
        (
            "GOOGLE_GMAIL_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/google/gmail/connector/callback",
        ),
        (
            "GOOGLE_CALENDAR_REDIRECT_URI",
            "http://localhost:8000/api/v1/auth/google/calendar/connector/callback",
        ),
    )
    for variable, fallback in fallbacks:
        os.environ.setdefault(variable, fallback)

    # Unlike the fallbacks above, these two always replace the existing value.
    os.environ.update(
        {
            "SLACK_CLIENT_ID": "fake-slack-mcp-client-id",
            "SLACK_CLIENT_SECRET": "fake-slack-mcp-client-secret",
        }
    )
def _install_synthetic_global_llm_config() -> None:
    """Copy the synthetic ``global_llm_config.yaml`` fixture into ``app/config``.

    The real file is gitignored (operators ship their own, with real API
    keys), so a fresh CI checkout has nothing at the path
    ``app.config.load_global_llm_configs()`` reads. With an empty
    ``GLOBAL_LLM_CONFIGS`` list, the worker's view of the config diverges
    from the API container.

    The fixture under ``tests/e2e/fixtures/`` is copied to the
    production-expected location BEFORE _import_celery_app() so that
    ``app.config`` picks it up on import. The copy only happens when the
    destination is missing, so a developer's local config is preserved.

    MUST run before _import_celery_app().

    Raises:
        RuntimeError: if the fixture file itself is missing.
    """
    import shutil

    fixture_path = os.path.join(_THIS_DIR, "fixtures", "global_llm_config.yaml")
    target_path = os.path.join(
        _BACKEND_ROOT, "app", "config", "global_llm_config.yaml"
    )

    if not os.path.exists(fixture_path):
        raise RuntimeError(
            f"E2E synthetic global LLM config fixture missing at {fixture_path!r}. "
            f"Restore tests/e2e/fixtures/global_llm_config.yaml from VCS."
        )

    if not os.path.exists(target_path):
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        shutil.copyfile(fixture_path, target_path)
        logger.info(
            "[e2e-global-llm-config] installed %s -> %s", fixture_path, target_path
        )
        return

    # An existing file is never overwritten.
    logger.info(
        "[e2e-global-llm-config] %s already exists; leaving it alone "
        "(local dev config preserved)",
        target_path,
    )
def _import_celery_app():
    """Import ``app.celery_app`` and hand back its ``celery_app`` object.

    This is the point where every module under ``app.*`` (task modules
    included) gets loaded and its bindings are created. The LLM/embedding
    factories captured here are replaced afterwards by
    _patch_llm_bindings().
    """
    import app.celery_app as production_module

    return production_module.celery_app
def _patch_llm_bindings() -> None:
"""Replace LLM factories at every known binding site in worker tasks."""
from unittest.mock import patch
from tests.e2e.fakes.chat_llm import (
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
targets = [
"app.services.llm_service.get_user_long_context_llm",
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
@ -172,38 +242,93 @@ def _patch_llm_bindings() -> None:
)
_patch_llm_bindings()
# Install every fake against the shared patch list, in a fixed order.
for _fake_module in (
    _fake_embeddings,
    _fake_confluence_oauth,
    _fake_confluence_indexer,
    _fake_native_google,
    _fake_onedrive_graph,
    _fake_dropbox_api,
    _fake_notion_module,
    _fake_linear_module,
    _fake_jira_module,
    _fake_clickup_module,
    _fake_mcp_runtime,
    _fake_mcp_oauth_runtime,
    _fake_slack_module,
):
    _fake_module.install(_active_patches)
del _fake_module
def _install_runtime_fakes() -> None:
    """Call ``install(_active_patches)`` on each E2E fake, in a fixed order."""
    from tests.e2e.fakes import (
        clickup_module,
        confluence_indexer,
        confluence_oauth,
        docling_service,
        dropbox_api,
        embeddings,
        jira_module,
        linear_module,
        mcp_oauth_runtime,
        mcp_runtime,
        native_google,
        notion_module,
        onedrive_graph,
        slack_module,
    )

    # The tuple order is the installation order.
    install_order = (
        embeddings,
        docling_service,
        confluence_oauth,
        confluence_indexer,
        native_google,
        onedrive_graph,
        dropbox_api,
        notion_module,
        linear_module,
        jira_module,
        clickup_module,
        mcp_runtime,
        mcp_oauth_runtime,
        slack_module,
    )
    for fake in install_order:
        fake.install(_active_patches)
# ---------------------------------------------------------------------------
# 5) Start the worker.
# ---------------------------------------------------------------------------
def _bootstrap():
    """Run the full E2E bootstrap and return the production Celery app.

    Ordering is load-bearing:
      1) Hijack composio + notion_client in sys.modules.
      2) Load .env + set env defaults (app.config reads env on import).
      3) Configure logging.
      4) Materialise the synthetic global_llm_config.yaml so the worker's
         view of GLOBAL_LLM_CONFIGS matches the API container.
      5) Import production celery_app (which transitively imports the
         now-faked external SDKs and reads the env defaults + YAML).
      6) Patch LLM / embedding bindings at every consumer site.
      7) Install runtime fakes for connectors and chat backends.

    Returns:
        The object returned by ``_import_celery_app()``, after patches and
        runtime fakes have been installed.
    """
    # Steps 1-2: must happen before anything under ``app.*`` is imported.
    _hijack_external_sdks()
    _load_dotenv_and_set_env_defaults()
    # Step 3: logging is configured before the banner so the banner is
    # emitted in the configured format.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger.warning(
        "*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings ***"
    )
    # Step 4: the YAML has to exist before the production import reads it.
    _install_synthetic_global_llm_config()
    # Step 5: import the production Celery app.
    celery_app = _import_celery_app()
    # Steps 6-7: bindings only exist after the import above, so patching
    # and fake installation come last.
    _patch_llm_bindings()
    _install_runtime_fakes()
    return celery_app
celery_app = _bootstrap()
def _main() -> None:
# Default queues mirror production (default queue + connectors queue
# so Drive indexing tasks are picked up).
queue_name = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
queues = f"{queue_name},{queue_name}.connectors"
# macOS forks-after-MPS-init crash prefork workers; threads avoid it.
default_pool = "threads" if sys.platform == "darwin" else "prefork"
pool = os.getenv("CELERY_POOL", default_pool)
concurrency = os.getenv("CELERY_CONCURRENCY", "2")
celery_app.worker_main(
argv=[
"worker",
"--loglevel=info",
f"--queues={queues}",
"--concurrency=2",
f"--pool={pool}",
f"--concurrency={concurrency}",
"--without-gossip",
"--without-mingle",
]

View file

@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
assert result.content_type == "document"
# ---------------------------------------------------------------------------
# Document path with vision LLM: per-image descriptions are spliced inline at
# image placeholders, or appended in a labeled section when no marker exists
# ---------------------------------------------------------------------------
def _fake_extraction_result(*descriptions):
    """Build a ``PictureExtractionResult`` from lightweight dict specs.

    Each positional argument is a dict with required keys ``page``,
    ``name`` and ``desc``; ``ordinal`` defaults to 0 and ``sha`` defaults
    to ``"deadbeef"``. Calling with no arguments yields a result with an
    empty description list.
    """
    from app.etl_pipeline.picture_describer import (
        PictureDescription,
        PictureExtractionResult,
    )

    def _to_description(spec):
        return PictureDescription(
            page_number=spec["page"],
            ordinal_in_page=spec.get("ordinal", 0),
            name=spec["name"],
            sha256=spec.get("sha", "deadbeef"),
            description=spec["desc"],
        )

    return PictureExtractionResult(
        descriptions=list(map(_to_description, descriptions))
    )
async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
    """An `<!-- image -->` placeholder plus caption is replaced inline.

    No orphaned ``## Image Content`` section should be produced. This is
    the headline scenario for the medxpertqa benchmark: the image content
    sits in the same chunk as the surrounding case text, so retrieval
    pulls the question, image, and answer options together.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    parser_markdown = (
        "# MedXpertQA-MM MM-130\n\n"
        "## Clinical case\n\nA 44-year-old man...\n\n"
        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
        "## Answer choices\n\nA) ...\n"
    )

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": parser_markdown}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    described = _fake_extraction_result(
        {
            "page": 1,
            "name": "Im0",
            "desc": "Axial CT showing a large cystic mass.",
        }
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=described),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    md = (await service.extract(request)).markdown_content

    must_be_absent = (
        # Placeholder + caption are consumed by the inline block.
        "<!-- image -->",
        "Image: MM-130-a.jpeg",
        # No OCR section: the fake extraction carries no ocr_text, and the
        # format omits the section when there is no text to show.
        "**OCR text:**",
        # No raw HTML / XML tags or blockquote wrapping leak.
        "<image",
        "> **Embedded image:**",
        # Nothing is appended; everything went inline.
        "## Image Content",
    )
    must_be_present = (
        # The block carries the captioned filename and the description.
        "**Embedded image:** `MM-130-a.jpeg`",
        "**Visual description:**",
        "Axial CT showing a large cystic mass.",
        # Surrounding case text and answer options survive.
        "A 44-year-old man...",
        "## Answer choices",
        "A) ...",
    )
    for fragment in must_be_absent:
        assert fragment not in md
    for fragment in must_be_present:
        assert fragment in md
async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
    """Without image markers in the parser markdown, descriptions are appended.

    This is the fallback for parsers that drop image placeholders
    entirely: the image content still lands in the markdown, just in a
    clearly labeled section instead of inline.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
    }
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    described = _fake_extraction_result(
        {"page": 1, "name": "Im0", "desc": "An image description."}
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=described),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    md = (await service.extract(request)).markdown_content

    for fragment in (
        "# Parsed PDF text",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
        "An image description.",
    ):
        assert fragment in md
async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
    tmp_path, mocker
):
    """With no vision LLM the parser markdown is returned untouched."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    # Keep a handle on the replacement so the "never called" check can
    # be made on it directly.
    describe_stub = mocker.AsyncMock()
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_stub,
    )

    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    outcome = await EtlPipelineService().extract(request)

    assert outcome.markdown_content == "# Parsed PDF text"
    assert "<image" not in outcome.markdown_content
    describe_stub.assert_not_called()
async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
    tmp_path, mocker
):
    """An exception from describe_pictures must not fail the extraction."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    # Simulate pypdf / the vision LLM blowing up inside the describer.
    exploding_describe = mocker.AsyncMock(side_effect=RuntimeError("pypdf exploded"))
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=exploding_describe,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    outcome = await service.extract(request)

    assert outcome.markdown_content == "# Parsed PDF text"
    assert outcome.etl_service == "DOCLING"
async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
    tmp_path, mocker
):
    """A vision-LLM-enabled PDF with zero extracted images is left unchanged."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Just text, no images"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    # The describer runs but reports no images at all.
    no_images = _fake_extraction_result()
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=no_images),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    outcome = await service.extract(request)

    assert outcome.markdown_content == "# Just text, no images"
    assert "<image" not in outcome.markdown_content
# ---------------------------------------------------------------------------
# Per-image OCR runner: wiring + behaviour
#
# When extracting a PDF with a vision LLM, the ETL service must ALSO
# pass an ``ocr_runner`` to picture_describer. The runner is a closure
# that re-feeds each extracted image through a vision-LLM-less
# EtlPipelineService -- i.e. the same OCR engine that handles
# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
# at each embedded image, with the text attached to the inline block.
# ---------------------------------------------------------------------------
async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
    tmp_path, mocker
):
    """describe_pictures must receive a callable ``ocr_runner`` keyword."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_stub = mocker.AsyncMock(return_value=_fake_extraction_result())
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_stub,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    )

    describe_stub.assert_awaited_once()
    passed_kwargs = describe_stub.await_args.kwargs
    assert "ocr_runner" in passed_kwargs
    assert callable(passed_kwargs["ocr_runner"])
async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
    tmp_path, mocker
):
    """The OCR runner closure re-extracts each image through the parser.

    The runner handed to describe_pictures is captured, invoked with a
    fake image path, and Docling is expected to have been called for that
    image. That shows the closure is bound to a vision-LLM-less
    sub-pipeline (otherwise it would recurse into the vision LLM and
    never reach the OCR engine).
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")
    image_path = tmp_path / "Im0.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "Slice 24 / 60 L R"
    }
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    recorded: dict = {}

    async def _record_runner(*_args, **kwargs):
        # Stand-in for describe_pictures that only remembers the runner.
        recorded["runner"] = kwargs["ocr_runner"]
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=_record_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    )

    ocr_runner = recorded["runner"]
    extracted_text = await ocr_runner(str(image_path), "Im0.png")

    assert extracted_text == "Slice 24 / 60 L R"
    # Two Docling calls in total: one for the PDF, one for the image that
    # was re-fed through the runner.
    assert docling_stub.process_document.await_count == 2
async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
    tmp_path, mocker
):
    """An unsupported image format makes the runner return "" without raising.

    Typical case: a PDF embeds a JPEG2000 or CCITT-TIFF image that Docling
    can't load. One unsupported embedded image must not spoil the whole
    PDF extraction; the runner is expected to swallow the
    EtlUnsupportedFileError and return "" so the image still gets a
    description but no OCR tag.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake content")
    # JPEG2000, unlikely to be supported.
    odd_image_path = tmp_path / "Im0.jp2"
    odd_image_path.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    recorded: dict = {}

    async def _record_runner(*_args, **kwargs):
        # Stand-in for describe_pictures that only remembers the runner.
        recorded["runner"] = kwargs["ocr_runner"]
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=_record_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(pdf_path), filename="report.pdf")
    )

    ocr_runner = recorded["runner"]
    extracted_text = await ocr_runner(str(odd_image_path), "Im0.jp2")
    assert extracted_text == ""
# ---------------------------------------------------------------------------
# Processing Mode enum tests
# ---------------------------------------------------------------------------

View file

@ -0,0 +1,967 @@
"""Unit tests for the picture_describer module.
Covers:
- :func:`describe_pictures` -- the PDF image walker + per-image vision
LLM call (structured output split into ``ocr_text`` and
``description``);
- :func:`inject_descriptions_inline` -- in-place replacement of image
placeholders / captions in the parser markdown;
- :func:`merge_descriptions_into_markdown` -- the top-level helper
that inlines what it can and appends what it can't;
- :func:`render_appended_section` -- the appended-fallback renderer.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.etl_pipeline.picture_describer import (
PictureDescription,
PictureExtractionResult,
describe_pictures,
inject_descriptions_inline,
merge_descriptions_into_markdown,
render_appended_section,
)
pytestmark = pytest.mark.unit
def _make_image_obj(name: str, data: bytes):
"""Mimic pypdf's ImageFile object shape for the bits we use."""
img = MagicMock()
img.name = name
img.data = data
return img
# ---------------------------------------------------------------------------
# describe_pictures: short-circuits
# ---------------------------------------------------------------------------
async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
    """A non-PDF input yields an empty result and never touches the LLM."""
    source = tmp_path / "report.docx"
    source.write_bytes(b"PK fake docx")
    vision_llm = AsyncMock()

    outcome = await describe_pictures(str(source), "report.docx", vision_llm)

    assert outcome.descriptions == []
    assert outcome.skipped_too_large == 0
    vision_llm.ainvoke.assert_not_called()
async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
    """Passing ``None`` as the vision LLM yields no descriptions, even for a PDF."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    outcome = await describe_pictures(str(source), "report.pdf", None)

    assert outcome.descriptions == []
async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
    """A readable PDF whose pages carry no images produces an empty result."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    # Two pages, neither with any image.
    reader_stub = MagicMock(pages=[MagicMock(images=[]), MagicMock(images=[])])
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
    vision_llm = AsyncMock()

    outcome = await describe_pictures(str(source), "report.pdf", vision_llm)

    assert outcome.descriptions == []
    vision_llm.ainvoke.assert_not_called()
# ---------------------------------------------------------------------------
# describe_pictures: happy paths
# ---------------------------------------------------------------------------
async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
    """Each eligible image triggers exactly one description-only vision call."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    jpeg_image = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    png_image = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
    # One image on each of two pages.
    reader_stub = MagicMock(
        pages=[MagicMock(images=[jpeg_image]), MagicMock(images=[png_image])]
    )
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_call = AsyncMock(side_effect=["Description A", "Description B"])
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=describe_call,
    )

    outcome = await describe_pictures(str(source), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 2
    description_by_name = {
        item.name: item.description for item in outcome.descriptions
    }
    assert description_by_name == {
        "Im0.jpeg": "Description A",
        "Im1.png": "Description B",
    }
    assert all(item.page_number in (1, 2) for item in outcome.descriptions)
    assert describe_call.await_count == 2
async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
    """The same image bytes on several pages are described only once."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    payload = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
    # Three pages, each holding a separate mock with identical bytes.
    reader_stub = MagicMock(
        pages=[
            MagicMock(images=[_make_image_obj("logo.png", payload)])
            for _ in range(3)
        ]
    )
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_call = AsyncMock(return_value="Logo desc")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=describe_call,
    )

    outcome = await describe_pictures(str(source), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.skipped_duplicate == 2
    assert describe_call.await_count == 1
async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
    """Very small images (tracking pixels, dots, etc.) are skipped."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    # An 8-byte payload next to a ~3 KB one on the same page.
    undersized = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
    regular = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)
    reader_stub = MagicMock(pages=[MagicMock(images=[undersized, regular])])
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_call = AsyncMock(return_value="CT scan")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=describe_call,
    )

    outcome = await describe_pictures(str(source), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ct.jpeg"
    assert outcome.skipped_too_small == 1
    assert describe_call.await_count == 1
async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
    """Images above the vision LLM's per-image size cap are skipped."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    # A 6 MiB payload next to a ~2 KB one on the same page.
    oversized = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
    regular = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock(pages=[MagicMock(images=[oversized, regular])])
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_call = AsyncMock(return_value="OK image")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=describe_call,
    )

    outcome = await describe_pictures(str(source), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ok.jpeg"
    assert outcome.skipped_too_large == 1
    assert describe_call.await_count == 1
async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
    """One image failing in the vision LLM must not abort the whole document."""
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    first = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
    second = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)
    reader_stub = MagicMock(pages=[MagicMock(images=[first, second])])
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    # The first vision call raises, the second one succeeds.
    flaky_describe = AsyncMock(
        side_effect=[RuntimeError("vision blew up"), "Success"]
    )
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=flaky_describe,
    )

    outcome = await describe_pictures(str(source), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].description == "Success"
    assert outcome.failed == 1
async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
    """If pypdf cannot open the file, the result is simply empty."""
    source = tmp_path / "broken.pdf"
    source.write_bytes(b"not a pdf")
    mocker.patch(
        "pypdf.PdfReader", side_effect=ValueError("EOF marker not found")
    )

    outcome = await describe_pictures(str(source), "broken.pdf", MagicMock())

    assert outcome.descriptions == []
# ---------------------------------------------------------------------------
# inject_descriptions_inline: replacement patterns
# ---------------------------------------------------------------------------
def _desc(name="Im0", description="A CT scan."):
    """Build a minimal ``PictureDescription`` fixture (page 1, ordinal 0).

    Only ``name`` and ``description`` vary between tests; the remaining
    fields are fixed placeholder values.
    """
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
    }
    return PictureDescription(**fields)
def test_inject_no_op_when_no_descriptions():
    """With no descriptions, the markdown comes back untouched and the count is 0."""
    source_md = "# Title\n\nbody text\n"
    rewritten, inlined = inject_descriptions_inline(
        source_md, PictureExtractionResult()
    )
    assert rewritten == source_md
    assert inlined == 0
def test_inject_replaces_placeholder_with_caption():
    """A placeholder plus its ``Image: <name>`` caption collapses into one block.

    This is the most common medxpertqa case: our renderer puts a caption
    line right below the embedded JPEG, and Docling preserves both.
    """
    source_md = "".join(
        [
            "# Case\n\n",
            "Clinical text...\n\n",
            "<!-- image -->\nImage: MM-130-a.jpeg\n\n",
            "Answer choices: A) ...\n",
        ]
    )
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1

    # The placeholder and its caption line are both consumed.
    for consumed in ("<!-- image -->", "Image: MM-130-a.jpeg"):
        assert consumed not in rewritten

    # New format: horizontal-rule-delimited section with an "Embedded
    # image:" anchor and a named "Visual description:" section. No
    # blockquote wrapping -- nested blocks (lists, code, tables) inside
    # a blockquote are silently dropped by Streamdown / remark.
    for expected in (
        "**Embedded image:** `MM-130-a.jpeg`",
        "**Visual description:**",
        "A CT scan.",
        # Horizontal rules make the block stand out from nearby paragraphs.
        "\n---\n",
        # Surrounding context is preserved.
        "Clinical text...",
        "Answer choices: A) ...",
    ):
        assert expected in rewritten

    for forbidden in (
        # No OCR section -- this fixture has no ocr_text on its descriptions.
        "**OCR text:**",
        # No raw HTML tags leak.
        "<image",
        "</image>",
        # We no longer wrap the block in `>`.
        "> **Embedded image:**",
    ):
        assert forbidden not in rewritten
def test_inject_uses_pypdf_name_when_no_caption():
    """Without a caption line, the block is labelled with the pypdf image name."""
    source_md = "# Case\n\n<!-- image -->\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    assert "**Embedded image:** `Im0`" in rewritten
def test_inject_replaces_bare_caption():
    """An ``Image: <name>`` line with no placeholder above it is still replaced."""
    source_md = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # The caption's file name labels the block; the raw caption line is gone.
    assert "**Embedded image:** `scan.jpeg`" in rewritten
    assert "Image: scan.jpeg" not in rewritten
def test_inject_handles_multiple_images_in_order():
    """Two placeholders and two descriptions are paired up in document order."""
    source_md = (
        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
    )
    first = PictureDescription(
        page_number=1,
        ordinal_in_page=0,
        name="Im0",
        sha256="aa",
        description="Desc A",
    )
    second = PictureDescription(
        page_number=2,
        ordinal_in_page=0,
        name="Im1",
        sha256="bb",
        description="Desc B",
    )
    extraction = PictureExtractionResult(descriptions=[first, second])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2
    assert "**Embedded image:** `a.jpeg`" in rewritten
    assert "**Embedded image:** `b.jpeg`" in rewritten
    # The first image's block precedes the second's.
    assert rewritten.index("a.jpeg") < rewritten.index("b.jpeg")
    assert "Desc A" in rewritten
    assert "Desc B" in rewritten
def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
    """One marker but three descriptions: only the first description is inlined."""
    candidates = [
        _desc(name=image_name, description=text)
        for image_name, text in (
            ("Im0", "First"),
            ("Im1", "Second"),
            ("Im2", "Third"),
        )
    ]
    extraction = PictureExtractionResult(descriptions=candidates)

    rewritten, inlined = inject_descriptions_inline(
        "Just one <!-- image --> here.\n", extraction
    )

    assert inlined == 1
    assert "**Embedded image:** `Im0`" in rewritten
    assert "**Embedded image:** `Im1`" not in rewritten
def test_inject_returns_zero_when_no_markers_present():
    """With no image markers in the markdown, nothing is inlined or changed."""
    source_md = "# Title\n\nJust text. No images mentioned at all.\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 0
    assert rewritten == source_md
# ---------------------------------------------------------------------------
# render_appended_section
# ---------------------------------------------------------------------------
def test_render_appended_empty_when_nothing_passed():
    """An empty description list renders as the empty string."""
    rendered = render_appended_section([])
    assert rendered == ""
def test_render_appended_renders_each_image_as_block():
    """Each description becomes its own rule-delimited block under the heading."""
    rendered = render_appended_section(
        [
            _desc(name="MM-130-a.jpeg", description="CT scan"),
            _desc(name="MM-130-b.jpeg", description="Bar chart"),
        ]
    )

    for expected in (
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `MM-130-a.jpeg`",
        "CT scan",
        "**Embedded image:** `MM-130-b.jpeg`",
        "Bar chart",
    ):
        assert expected in rendered

    # Each image block is delimited by horizontal rules.
    assert rendered.count("\n---\n") >= 2

    # No raw HTML / XML / blockquote prefixes, and no OCR section.
    for forbidden in ("<image", "> **Embedded image:**", "**OCR text:**"):
        assert forbidden not in rendered
def test_render_appended_includes_skip_notes():
    """Skip / failure counters passed as ``skip_notes`` surface in a note line."""
    described = [_desc()]
    counters = PictureExtractionResult(
        descriptions=described,
        skipped_too_small=2,
        skipped_too_large=1,
        skipped_duplicate=3,
        failed=1,
    )

    rendered = render_appended_section(described, skip_notes=counters)

    for expected in (
        "_Note:",
        "2 too small",
        "1 too large",
        "3 duplicate",
        "1 failed",
    ):
        assert expected in rendered
# ---------------------------------------------------------------------------
# merge_descriptions_into_markdown: top-level
# ---------------------------------------------------------------------------
def test_merge_inlines_when_marker_present():
    """A single marker with a single description is inlined; nothing is appended."""
    source_md = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    merged = merge_descriptions_into_markdown(source_md, extraction)

    assert "**Embedded image:** `scan.jpeg`" in merged
    # Everything went inline, so the appended-section heading must be
    # absent.
    assert "## Image Content" not in merged
def test_merge_appends_when_no_marker_present():
    """With zero markers, every description lands in the appended section."""
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="An image desc.")]
    )

    merged = merge_descriptions_into_markdown(
        "Pure text doc, no image markers.\n", extraction
    )

    for expected in (
        "Pure text doc",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
    ):
        assert expected in merged
def test_merge_appends_leftovers_with_distinct_heading():
    """One marker, two descriptions: the first is inlined and the second is
    appended under a heading that marks it as a leftover.
    """
    source_md = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
    inlined_desc = _desc(name="Im0", description="First")
    leftover_desc = _desc(name="Im1", description="Second")
    extraction = PictureExtractionResult(
        descriptions=[inlined_desc, leftover_desc]
    )

    merged = merge_descriptions_into_markdown(source_md, extraction)

    # Inlined at the marker, labelled with the caption's file name.
    assert "**Embedded image:** `a.jpeg`" in merged
    # Leftover goes under the dedicated heading, labelled with its pypdf name.
    assert "## Image Content (additional, no inline marker found)" in merged
    assert "**Embedded image:** `Im1`" in merged
# ---------------------------------------------------------------------------
# describe_pictures: ocr_runner integration
#
# These tests cover the per-image OCR side-channel: when the caller
# supplies an ``ocr_runner`` callable, each extracted image is sent
# both to the vision LLM (visual description) and to the OCR runner
# (text-in-image), in parallel. The OCR text -- if any -- is recorded
# on the PictureDescription and rendered in the inline block.
# ---------------------------------------------------------------------------
async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
    """A supplied ocr_runner is awaited once for each eligible image."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    images = [
        _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000),
        _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000),
    ]
    reader = MagicMock(pages=[MagicMock(images=images)])
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(side_effect=["Visual A", "Visual B"]),
    )
    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=ocr_runner
    )

    assert ocr_runner.await_count == 2
    ocr_by_image = dict(
        (described.name, described.ocr_text)
        for described in outcome.descriptions
    )
    assert ocr_by_image == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
    tmp_path, mocker
):
    """Vision LLM and OCR run concurrently per image, not sequentially.

    Both stubs bump a shared in-flight counter around an ``asyncio.sleep``.
    If the two calls overlap, the counter peaks at 2; if they run back to
    back it never exceeds 1. Unlike a wall-clock bound, this check does not
    depend on how fast (or how loaded) the CI machine is.
    """
    import asyncio

    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake")
    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    fake_reader = MagicMock()
    fake_reader.pages = [MagicMock(images=[img])]
    mocker.patch("pypdf.PdfReader", return_value=fake_reader)

    # Each stub suspends for this long, giving its sibling call the chance
    # to start while it is still in flight.
    sleep_each = 0.05
    in_flight = 0
    peak_in_flight = 0

    async def _tracked_call(payload):
        """Sleep while counted as in flight, then hand back ``payload``."""
        nonlocal in_flight, peak_in_flight
        in_flight += 1
        peak_in_flight = max(peak_in_flight, in_flight)
        try:
            await asyncio.sleep(sleep_each)
        finally:
            in_flight -= 1
        return payload

    async def slow_vision(*args, **kwargs):
        return await _tracked_call("Visual")

    async def slow_ocr(*args, **kwargs):
        return await _tracked_call("OCR")

    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=slow_vision,
    )
    fake_llm = MagicMock()

    result = await describe_pictures(
        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
    )

    assert len(result.descriptions) == 1
    assert result.descriptions[0].ocr_text == "OCR"
    # Sequential execution would never have more than one stub in flight.
    assert peak_in_flight == 2, (
        "vision+OCR appear to be sequential "
        f"(peak concurrent calls: {peak_in_flight})"
    )
async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
    """Whitespace-only OCR output is stored as ``None`` on the description.

    This means the rendered image block won't carry an empty
    "OCR text" section for images that contain no text at all
    (e.g. a clean radiograph).
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    picture = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader = MagicMock(pages=[MagicMock(images=[picture])])
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="A radiograph."),
    )
    # The OCR runner yields nothing but spaces and newlines.
    blank_ocr = AsyncMock(return_value=" \n \n")

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=blank_ocr
    )

    assert len(outcome.descriptions) == 1
    (described,) = outcome.descriptions
    assert described.ocr_text is None
async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
    """An exception from the OCR runner leaves the image's description intact.

    OCR is supplementary; the vision LLM's description is the primary
    payload. If OCR blows up we drop the OCR field for that image and
    keep the description.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    picture = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader = MagicMock(pages=[MagicMock(images=[picture])])
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="A radiograph."),
    )
    failing_ocr = AsyncMock(side_effect=RuntimeError("OCR backend down"))

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=failing_ocr
    )

    assert len(outcome.descriptions) == 1
    (described,) = outcome.descriptions
    assert described.description == "A radiograph."
    assert described.ocr_text is None
    # The IMAGE didn't fail; only its OCR did.
    assert outcome.failed == 0
async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
    tmp_path, mocker
):
    """A vision LLM failure drops the image even when OCR returned text.

    The inline block's primary purpose is the visual description; an
    OCR-only block would be misleading (it'd look like the vision
    pipeline ran when it didn't), so we treat vision failure as image
    failure regardless of OCR outcome.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    picture = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader = MagicMock(pages=[MagicMock(images=[picture])])
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(side_effect=RuntimeError("vision blew up")),
    )
    working_ocr = AsyncMock(return_value="OCR text")

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=working_ocr
    )

    assert outcome.descriptions == []
    assert outcome.failed == 1
async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
    tmp_path, mocker
):
    """Backward compat: without an ocr_runner, ``ocr_text`` stays ``None``."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    picture = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader = MagicMock(pages=[MagicMock(images=[picture])])
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="Visual"),
    )

    # Note: no ocr_runner keyword is passed here.
    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    (described,) = outcome.descriptions
    assert described.ocr_text is None
# ---------------------------------------------------------------------------
# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
# ---------------------------------------------------------------------------
def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L R 10mm"):
    """Build a ``PictureDescription`` fixture that also carries ``ocr_text``.

    The remaining fields (page 1, ordinal 0, sha256 "aa") are fixed
    placeholder values.
    """
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
        "ocr_text": ocr_text,
    }
    return PictureDescription(**fields)
def test_inject_renders_ocr_section_when_ocr_text_present():
    """A description carrying ``ocr_text`` renders an "OCR text" section."""
    source_md = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    extraction = PictureExtractionResult(
        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L R 10mm")]
    )

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    for expected in (
        "**Embedded image:** `scan.jpeg`",
        "**OCR text:**",
        "L R 10mm",
    ):
        assert expected in rewritten

    # OCR section comes before the visual description (literal text
    # first, interpretation second).
    ocr_at = rewritten.index("**OCR text:**")
    visual_at = rewritten.index("**Visual description:**")
    assert ocr_at < visual_at

    # Critical: no nested-block constructs (fenced code, blockquote)
    # that previous formats relied on -- both broke in Streamdown /
    # PlateJS by escaping their container and dropping content.
    for forbidden in ("```", "> **"):
        assert forbidden not in rewritten
def test_inject_renders_multiline_ocr_with_hard_breaks():
    """Multi-line OCR uses trailing-two-spaces hard breaks so each
    line renders on its own row, without needing a fragile fenced
    code block or blockquote wrapper."""
    # Markdown hard line break: exactly two trailing spaces, then a newline.
    # Built with ``" " * 2`` so the width survives editors and renderers
    # that collapse runs of whitespace inside string literals.
    hard_break = " " * 2 + "\n"

    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
    result = PictureExtractionResult(
        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
    )
    out, _ = inject_descriptions_inline(markdown, result)

    # Every OCR line is present.
    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
        assert line in out

    # Non-last OCR lines get the trailing two-space hard break.
    assert "Slice 24 / 60" + hard_break in out
    assert "\nL" + hard_break in out
    assert "\nR" + hard_break in out

    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
    assert "10 mm" + hard_break not in out
    assert "10 mm\n" in out
def test_render_appended_renders_ocr_section_when_ocr_text_present():
    """The appended section shows OCR text alongside the visual description."""
    with_ocr = _desc_with_ocr(
        name="MM-130-a.jpeg",
        description="Axial CT.",
        ocr_text="Slice 24 / 60",
    )

    rendered = render_appended_section([with_ocr])

    for expected in ("**OCR text:**", "Slice 24 / 60", "Axial CT."):
        assert expected in rendered
def test_render_omits_ocr_section_when_ocr_text_is_none():
    """A description without ``ocr_text`` renders no "OCR text" section."""
    rendered = render_appended_section(
        [_desc(name="Im0", description="A clean radiograph.")]
    )

    assert "**Embedded image:** `Im0`" in rendered
    assert "**OCR text:**" not in rendered
    assert "**Visual description:**" in rendered
    # No raw HTML / blockquote prefixes.
    for forbidden in ("<image", "> **"):
        assert forbidden not in rendered
# ---------------------------------------------------------------------------
# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
#
# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
# premium both emit ``<figure>...</figure>`` blocks that already contain
# the parser's own OCR of the figure (chart bar values, axis labels,
# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
# That parser-side content is useful for retrieval on its own, so we
# PRESERVE the figure verbatim and append our vision-LLM block
# immediately after rather than substituting for it.
# ---------------------------------------------------------------------------
def test_inject_appends_block_after_figure_preserving_parser_content():
    """The figure is kept as-is and the vision-LLM block follows it."""
    source_md = "".join(
        [
            "Some narrative text.\n\n",
            "<figure>\n\n",
            "Republican\n68\nDemocrat\n30\n",
            "\n</figure>\n\n",
            "Following paragraph.\n",
        ]
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
    )

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1

    # Original figure is preserved verbatim -- the parser's OCR'd
    # numbers must still be searchable.
    for kept in ("<figure>", "</figure>", "Republican", "68"):
        assert kept in rewritten

    # Our vision-LLM block follows the figure, not before / inside it.
    block_anchor = "**Embedded image:** `Im0`"
    assert block_anchor in rewritten
    assert "Bar chart of party ID." in rewritten
    figure_close = rewritten.index("</figure>")
    embedded_at = rewritten.index(block_anchor)
    assert figure_close < embedded_at, "block must be appended AFTER </figure>"

    # Surrounding narrative is preserved.
    for narrative in ("Some narrative text.", "Following paragraph."):
        assert narrative in rewritten
def test_inject_handles_multiple_figures_in_document_order():
    """N figures and N descriptions: each description follows its own figure."""
    source_md = (
        "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n"
        "Between\n\n<figure>\nChart B bars\n</figure>\n\n"
        "End.\n"
    )
    chart_a = PictureDescription(
        page_number=1,
        ordinal_in_page=0,
        name="Im0",
        sha256="aa",
        description="Description of chart A.",
    )
    chart_b = PictureDescription(
        page_number=2,
        ordinal_in_page=0,
        name="Im1",
        sha256="bb",
        description="Description of chart B.",
    )
    extraction = PictureExtractionResult(descriptions=[chart_a, chart_b])

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2

    # Both figures preserved; both descriptions inlined; order matches.
    assert rewritten.count("<figure>") == 2
    assert rewritten.count("</figure>") == 2
    assert "Description of chart A." in rewritten
    assert "Description of chart B." in rewritten
    desc_a_at = rewritten.index("Description of chart A.")
    desc_b_at = rewritten.index("Description of chart B.")
    assert desc_a_at < desc_b_at

    # Each description appears AFTER its corresponding </figure>.
    first_close = rewritten.index("</figure>")
    assert first_close < desc_a_at
    second_close = rewritten.index("</figure>", first_close + 1)
    assert second_close < desc_b_at
def test_inject_figures_with_attributes_and_nested_tags():
    """A ``<figure>`` carrying attributes and nested tags is matched and kept."""
    source_md = "".join(
        [
            '<figure id="fig-3" class="chart">\n',
            "<figcaption>Source: Pew Research</figcaption>\n",
            "<table><tr><td>Republican</td><td>57</td></tr></table>\n",
            "</figure>\n",
        ]
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Survey table.")]
    )

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # All nested HTML is preserved (chunking will pick it up).
    for kept in (
        'id="fig-3"',
        "<figcaption>Source: Pew Research</figcaption>",
        "<table>",
        "Republican",
        "57",
    ):
        assert kept in rewritten
    # Our block sits after the closing tag.
    closing_at = rewritten.index("</figure>")
    assert closing_at < rewritten.index("**Embedded image:** `Im0`")
def test_inject_figures_more_descriptions_than_figures_returns_remaining():
    """One figure but three descriptions: only the first is inlined."""
    candidates = [
        _desc(name=image_name, description=text)
        for image_name, text in (
            ("Im0", "First desc."),
            ("Im1", "Second desc."),
            ("Im2", "Third desc."),
        )
    ]
    extraction = PictureExtractionResult(descriptions=candidates)

    rewritten, inlined = inject_descriptions_inline(
        "Text.\n<figure>\nbar values\n</figure>\nMore.\n", extraction
    )

    assert inlined == 1
    assert "First desc." in rewritten
    # Leftovers are the caller's job; inject_descriptions_inline does
    # not append them on its own.
    for leftover in ("Second desc.", "Third desc."):
        assert leftover not in rewritten
def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
    """Two figures but one description: only the first figure gets a block."""
    source_md = (
        "<figure>\nfigure 1 content\n</figure>\n"
        "<figure>\nfigure 2 content\n</figure>\n"
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Only description.")]
    )

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # Both figures still present; only the first one was enriched.
    assert rewritten.count("<figure>") == 2
    assert "Only description." in rewritten

    # Second figure has no embedded-image block after it: locate the second
    # opening tag, then its closing tag, and inspect everything from there on.
    first_open = rewritten.index("<figure>")
    second_open = rewritten.index("<figure>", first_open + 1)
    second_close = rewritten.index("</figure>", second_open)
    tail = rewritten[second_close:]
    assert "**Embedded image:**" not in tail
def test_merge_inlines_at_figure_boundary():
    """The top-level merge inlines at a figure and adds no appended section."""
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Bar chart.")]
    )

    merged = merge_descriptions_into_markdown(
        "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n", extraction
    )

    # Inline succeeded -> no appended-section heading.
    assert "## Image Content" not in merged
    assert "Bar chart." in merged
    for tag in ("<figure>", "</figure>"):
        assert tag in merged
def test_inject_figures_then_falls_through_to_docling_marker():
    """Mixed markers: the figure takes the first description, the Docling
    placeholder takes the second.

    Defensive -- single docs are usually one parser's output, but if a
    pipeline ever stitches two parsers' markdowns together the inliner
    should still place each description.
    """
    source_md = "".join(
        [
            "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n",
            "Later in the doc:\n\n",
            "<!-- image -->\nImage: scan.jpeg\n\n",
            "End.\n",
        ]
    )
    extraction = PictureExtractionResult(
        descriptions=[
            _desc(name="Im0", description="Chart description."),
            _desc(name="Im1", description="Scan description."),
        ]
    )

    rewritten, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2

    # Figure preserved + augmented.
    for kept in ("<figure>", "Chart bars: 50, 40, 30", "Chart description."):
        assert kept in rewritten

    # Docling placeholder + caption replaced.
    for consumed in ("<!-- image -->", "Image: scan.jpeg"):
        assert consumed not in rewritten
    assert "**Embedded image:** `scan.jpeg`" in rewritten
    assert "Scan description." in rewritten

View file

@ -0,0 +1,146 @@
"""Unit tests for the vision_llm parser helpers.
Two helpers exist:
- :func:`parse_with_vision_llm` -- single-shot for standalone image
uploads (.png/.jpg/etc). Returns combined markdown (description +
verbatim OCR mixed) since the image *is* the document.
- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
visual description only; OCR is the ETL service's job.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# parse_with_vision_llm: legacy single-shot path
# ---------------------------------------------------------------------------
async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
    """For a standalone image, the model's response content is returned as-is."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    llm = AsyncMock()
    llm.ainvoke.return_value = MagicMock(content="# A scan of something.")

    markdown = await parse_with_vision_llm(str(image_path), "scan.png", llm)

    assert markdown == "# A scan of something."
    llm.ainvoke.assert_awaited_once()
async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
    """An empty model response raises ``ValueError`` instead of returning blanks."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    llm = AsyncMock()
    llm.ainvoke.return_value = MagicMock(content="")

    with pytest.raises(ValueError, match="empty content"):
        await parse_with_vision_llm(str(image_path), "scan.png", llm)
# ---------------------------------------------------------------------------
# parse_image_for_description: per-image-in-PDF, description only
# ---------------------------------------------------------------------------
async def test_parse_image_for_description_returns_description(tmp_path):
    """The description-only helper hands back the model's content unchanged."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    model_text = "Axial CT showing a large cystic mass."
    llm = AsyncMock()
    llm.ainvoke.return_value = MagicMock(content=model_text)

    described = await parse_image_for_description(
        str(image_path), "scan.png", llm
    )

    assert described == "Axial CT showing a large cystic mass."
async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
    """The prompt sent to the model explicitly says NOT to transcribe text.

    This is the contract that lets us drop OCR from the response: the
    ETL pipeline already has the text (from page-level OCR), so asking
    the vision LLM for it would be redundant cost.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    llm = AsyncMock()
    llm.ainvoke.return_value = MagicMock(content="A description")

    await parse_image_for_description(str(image_path), "scan.png", llm)

    # The prompt is the first text part of the first message we sent.
    (messages,) = llm.ainvoke.call_args.args[:1]
    first_part = messages[0].content[0]
    prompt = first_part["text"].lower()
    for required_phrase in (
        "describe what this image visually depicts",
        "do not transcribe text",
    ):
        assert required_phrase in prompt
async def test_parse_image_for_description_rejects_empty(tmp_path):
    """A blank response raises ``ValueError`` so the caller can skip the image."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    llm = AsyncMock()
    # Whitespace-only content counts as empty.
    llm.ainvoke.return_value = MagicMock(content=" ")

    with pytest.raises(ValueError, match="empty content"):
        await parse_image_for_description(str(image_path), "scan.png", llm)
# ---------------------------------------------------------------------------
# Image size + extension validation (shared by both paths)
# ---------------------------------------------------------------------------
def test_image_to_data_url_rejects_oversized(tmp_path):
    """An image above the 5 MB limit is rejected with ``ValueError``."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    # A bit over 6 MiB of payload after a PNG-looking prefix.
    oversized_payload = b"\x89PNG" + b"\x00" * (6 * 1024 * 1024)
    oversized = tmp_path / "huge.png"
    oversized.write_bytes(oversized_payload)

    with pytest.raises(ValueError, match="Image too large"):
        _image_to_data_url(str(oversized))
def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
    """A file with an unknown extension raises instead of guessing a MIME type."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    unknown = tmp_path / "scan.xyz"
    unknown.write_bytes(b"\x00" * 100)

    with pytest.raises(ValueError, match="Unsupported image extension"):
        _image_to_data_url(str(unknown))

View file

@ -21,7 +21,7 @@
"email": "rohan@surfsense.com"
},
"license": "MIT",
"packageManager": "pnpm@10.24.0",
"packageManager": "pnpm@10.26.0",
"devDependencies": {
"@electron/rebuild": "^4.0.3",
"@types/node": "^25.5.0",

View file

@ -0,0 +1,83 @@
# surfsense_evals — environment template.
#
# Copy this file to `.env` (in the surfsense_evals/ project root or your
# CWD) and fill in the values. `python-dotenv` loads it automatically
# the first time `core.config` is imported, so every CLI subcommand
# (`setup`, `ingest`, `run`, `report`, `teardown`, `models list`, …)
# will pick the values up.
#
# cp .env.example .env
# # then edit .env with your values
#
# `.env` is gitignored — never commit real secrets.
# ---------------------------------------------------------------------------
# 1. Backend target — REQUIRED (default works for a local dev backend)
# ---------------------------------------------------------------------------
SURFSENSE_API_BASE=http://localhost:8000
# ---------------------------------------------------------------------------
# 2. OpenRouter — REQUIRED for any `run` invocation
# ---------------------------------------------------------------------------
# The `native_pdf` arm calls OpenRouter directly; the `surfsense` arm
# routes through SurfSense which uses the same key under the hood.
OPENROUTER_API_KEY=sk-or-...
# Override only if you proxy OpenRouter through a private gateway:
# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
# Multimodal benchmarks (medxpertqa, mmlongbench) require a vision-capable
# slug. Recommended (verify in your catalog with `models list --grep ...`):
# anthropic/claude-sonnet-4.5 (default recommendation)
# anthropic/claude-opus-4.7 (strongest)
# openai/gpt-5 (top-tier vision)
# google/gemini-2.5-pro (1M-token context, best for long PDFs)
# DO NOT use openai/gpt-5.4-mini for image-bearing benchmarks — it's
# text-only on PDF content and the runner emits a warning if pinned.
# ---------------------------------------------------------------------------
# 3. Auth — pick EXACTLY ONE of the two modes below
# ---------------------------------------------------------------------------
# --- Mode A: LOCAL (backend started with AUTH_TYPE=LOCAL)
# The harness POSTs these to /auth/jwt/login automatically.
# SURFSENSE_USER_EMAIL=you@example.com
# SURFSENSE_USER_PASSWORD=...
# --- Mode B: GOOGLE OAuth (or any pre-issued JWT)
# Open the SurfSense web UI in your browser, log in via Google, then in
# DevTools → Application → Local Storage copy:
# surfsense_bearer_token → SURFSENSE_JWT
# surfsense_refresh_token → SURFSENSE_REFRESH_TOKEN (optional, enables
# auto-refresh on 401)
# SURFSENSE_JWT=eyJhbGciOi...
# SURFSENSE_REFRESH_TOKEN=eyJhbGciOi...
# ---------------------------------------------------------------------------
# 4. Filesystem paths — OPTIONAL (defaults below)
# ---------------------------------------------------------------------------
# Where datasets, rendered PDFs, ingestion id maps, run outputs, and
# state.json live. Default: <surfsense_evals>/data/
# EVAL_DATA_DIR=./data
# Where generated reports (summary.md / summary.json) get written.
# Default: <surfsense_evals>/reports/
# EVAL_REPORTS_DIR=./reports
# ---------------------------------------------------------------------------
# 5. Parser SDKs — REQUIRED for the multimodal_doc / parser_compare suite
# ---------------------------------------------------------------------------
# parser_compare calls Azure Document Intelligence and LlamaParse SDKs
# directly from the eval harness so each (basic / premium) extraction
# is a clean A/B test independent of the SurfSense backend's ETL routing.
#
# Azure Document Intelligence — used for the `azure_basic_lc` (prebuilt-read)
# and `azure_premium_lc` (prebuilt-layout) arms. Get an endpoint + key from
# https://portal.azure.com (Document Intelligence resource, F0 / S0 tier).
# AZURE_DI_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com/
# AZURE_DI_KEY=<your-32-char-key>
#
# LlamaCloud (LlamaParse) — used for `llamacloud_basic_lc` (parse_page_with_llm)
# and `llamacloud_premium_lc` (parse_page_with_agent). Get a key from
# https://cloud.llamaindex.ai/api-key.
# LLAMA_CLOUD_API_KEY=llx-...

35
surfsense_evals/.gitignore vendored Normal file
View file

@ -0,0 +1,35 @@
# Python bytecode + caches
__pycache__/
*.py[cod]
*.pyo
# Editable-install / build artifacts
*.egg-info/
build/
dist/
.eggs/
# Virtual envs (uv venv default + common alternates)
.venv/
venv/
env/
# Tooling caches
.pytest_cache/
.ruff_cache/
.mypy_cache/
.coverage
.coverage.*
htmlcov/
# Local secrets — keep `.env.example` tracked, never the real `.env`.
.env
.env.local
.env.*.local
!.env.example
# Run / debug logs — keep these ephemeral; the structured artifacts in
# `data/.../<run_id>/` are the citation surface, not these logs.
*.log
logs_*.txt
retry_run.log

228
surfsense_evals/README.md Normal file
View file

@ -0,0 +1,228 @@
# SurfSense Evals
Domain-agnostic eval harness for SurfSense. Each benchmark is a Python subpackage under `suites/<domain>/<benchmark>/` that self-registers with the CLI; `core/` is the shared infrastructure (HTTP clients, arms, parsers, metrics, report writer, registry). The harness talks to SurfSense over HTTP only — it does **not** import any backend Python module — so it ships in its own venv and never bloats the FastAPI runtime image.
## Benchmarks
| Benchmark | Shape | Vision required? | Default ingest |
|---------------------------------|--------------------------------------------------|------------------|----------------------------|
| `medical/medxpertqa` (headline) | Native PDF vs SurfSense head-to-head, MCQ | yes | `vision=on, mode=basic` |
| `medical/mirage` | SurfSense single-arm, MCQ | no | `vision=off, mode=basic` |
| `medical/cure` | SurfSense single-arm retrieval (Recall/MRR/nDCG) | no | `vision=off, mode=basic` |
| `multimodal_doc/mmlongbench` | Native PDF vs SurfSense head-to-head, open-ended | yes | `vision=on, mode=basic` |
Future domains (`legal/`, `finance/`, `code/`, `scientific/`) drop into `suites/` without touching `core/` or the CLI.
## Install + auth
```bash
uv pip install -e ./surfsense_evals
cp surfsense_evals/.env.example surfsense_evals/.env
# Edit .env: SURFSENSE_API_BASE, OPENROUTER_API_KEY, and ONE of:
# LOCAL → SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD
# GOOGLE → SURFSENSE_JWT (+ optional SURFSENSE_REFRESH_TOKEN)
# (lift both from browser localStorage after a normal Google login)
```
## Step-by-step: run all four benchmarks
The medical and multimodal_doc suites each get their own SearchSpace and pinned model, so they're independent — run them in any order. Both head-to-head benchmarks (`medxpertqa`, `mmlongbench`) require a **vision-capable** OpenRouter slug; pinning a text-only one (e.g. `openai/gpt-5.4-mini`) silently drops images and the runner emits a warning.
Recommended vision slugs (use `models list --grep <name>` to confirm one): `anthropic/claude-sonnet-4.5` (balanced cost), `anthropic/claude-opus-4.7` (strongest reasoning), `openai/gpt-5` (top-tier vision), `google/gemini-2.5-pro` (best for long PDFs, 1M-token context).
```bash
# 0. (optional) discover what's registered
python -m surfsense_evals suites list
python -m surfsense_evals benchmarks list
# 1. MEDICAL SUITE — one SearchSpace, three benchmarks
python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
# 1a. headline head-to-head: Native PDF (vision) vs SurfSense (vision RAG)
# Downloads dev+test JSONL + images.zip, renders one PDF per question
# (case + table + images + 5 options), uploads with use_vision_llm=True.
python -m surfsense_evals ingest medical medxpertqa --split test
python -m surfsense_evals run medical medxpertqa --concurrency 4
# 1b. MIRAGE — single-arm SurfSense MCQ accuracy
# (MMLU-Med / MedQA-US / MedMCQA / PubMedQA / BioASQ)
python -m surfsense_evals ingest medical mirage
python -m surfsense_evals run medical mirage
# 1c. CUREv1 — single-arm SurfSense retrieval (Recall@k / MRR / nDCG@10)
python -m surfsense_evals ingest medical cure --lang en
python -m surfsense_evals run medical cure --lang en
# 1d. write reports/medical/<UTC-ts>/summary.{md,json}
python -m surfsense_evals report --suite medical
# 2. MULTIMODAL_DOC SUITE — long PDFs with embedded images, charts, tables
python -m surfsense_evals setup --suite multimodal_doc --provider-model google/gemini-2.5-pro
python -m surfsense_evals ingest multimodal_doc mmlongbench # ~660MB, resumable
python -m surfsense_evals run multimodal_doc mmlongbench --concurrency 4
python -m surfsense_evals report --suite multimodal_doc
# 3. CLEANUP — soft-deletes the SearchSpaces; rendered PDFs stay cached
python -m surfsense_evals teardown --suite medical
python -m surfsense_evals teardown --suite multimodal_doc
```
## Asymmetric scenarios — the "vision-extract once, answer cheap" play
The walkthrough above is `--scenario head-to-head` (default): both arms answer with the same vision-capable slug. SurfSense's actual architectural value-prop is that the **ingestion-time vision LLM and the runtime LLM are completely independent** — you can pay a vision LLM *once*, at ingest, to convert every embedded image into text (per-image OCR **and** semantic description, inlined where the image actually appears in the document — see [What `--use-vision-llm` produces](#what---use-vision-llm-produces) below). Then every query is served by a cheap text-only model that sees that extracted text natively. Two extra scenarios make this explicit:
| `--scenario` | Native arm answers with | SurfSense arm answers with | Question being measured |
|--------------------|----------------------------------------|--------------------------------|------------------------------------------------------------------------------------------|
| `head-to-head` | `--provider-model` (vision) | `--provider-model` (vision) | Pure RAG quality at parity. (Default.) |
| `symmetric-cheap` | `--provider-model` (cheap, text-only) | `--provider-model` (same) | Does pre-extracted image context let a non-vision LLM reason over image-heavy docs? |
| `cost-arbitrage` | `--native-arm-model` (vision) | `--provider-model` (cheap) | How close does SurfSense get to a vision-native baseline at a fraction of per-query cost?|
In all three modes the **ingest-time** vision LLM is set on the SearchSpace's `vision_llm_config_id` (auto-picked from the strongest registered global OpenRouter vision config — `claude-sonnet-4.5` > `claude-opus-4.7` > `gpt-5` > `gemini-2.5-pro`, override with `--vision-llm <slug>`). What changes is which slug the *answering* models hit per arm.
### Ingest with vision, evaluate with a non-vision LLM (`symmetric-cheap`)
This is the answer to *"does SurfSense give a non-vision LLM enough context to reason over image-heavy docs?"*. Both arms hit the same cheap text-only slug. The native arm is structurally blind to images (text-only LLM + raw PDFs). The SurfSense arm reads chunks that already contain the per-image OCR and visual descriptions, written there by the vision LLM at ingest time.
```bash
python -m surfsense_evals setup --suite medical \
--scenario symmetric-cheap \
--provider-model openai/gpt-5.4-mini
# vision LLM at ingest = auto-picked (claude-sonnet-4.5 by default)
# answer LLM for BOTH arms = openai/gpt-5.4-mini (text-only)
python -m surfsense_evals ingest medical medxpertqa --split test # vision=on by default
python -m surfsense_evals run medical medxpertqa --concurrency 4
python -m surfsense_evals report --suite medical
# Δ accuracy on image-required MCQs is the headline number; native arm
# baseline is "what a text-only LLM gets without seeing the images".
```
### Cheap SurfSense vs vision-native baseline (`cost-arbitrage`)
```bash
python -m surfsense_evals setup --suite medical \
--scenario cost-arbitrage \
--provider-model openai/gpt-5.4-mini \
--native-arm-model anthropic/claude-sonnet-4.5
# vision LLM at ingest = auto-picked claude-sonnet-4.5
# native arm = sonnet (vision); SurfSense arm = gpt-5.4-mini (text-only)
python -m surfsense_evals ingest medical medxpertqa --split test
python -m surfsense_evals run medical medxpertqa --concurrency 4
python -m surfsense_evals report --suite medical
# Report header reads:
# Scenario: cost-arbitrage — native arm answers with `anthropic/claude-sonnet-4.5`
# (vision); SurfSense answers with `openai/gpt-5.4-mini` over chunks vision-extracted
# at ingest by `anthropic/claude-sonnet-4.5`.
```
Notes:
- `cost-arbitrage` requires both `--provider-model` (the cheap SurfSense slug) AND `--native-arm-model <vision slug>`.
- `--vision-llm <slug>` is optional; if omitted the harness queries `GET /api/v1/global-vision-llm-configs` and auto-picks the strongest registered one. Pass `--no-vision-llm-setup` if you want to keep whatever vision config is already attached to the SearchSpace.
- The runner's "looks text-only" warning is suppressed (or relabelled as informational) for `symmetric-cheap` so intentional asymmetry doesn't read as a misconfiguration.
- All four scenario fields (`scenario`, `provider_model`, `native_arm_model`, `vision_provider_model`) are persisted to `state.json` and recorded in `run_artifact.extra` + the report header — no need to retrace what was set.
## Per-benchmark useful flags
`medical/medxpertqa` (`run`):
- `--split {test,dev,all}` — pick a subset (default `test`)
- `--task "Diagnosis"` / `--body-system "Cardiovascular"` — slice the report
- `--require-images` — drop rare rows where every image filename failed to resolve
- `--n 100` — quick smoke run
- `--no-mentions` — let SurfSense retrieve unscoped ("did the @-mention matter?")
`multimodal_doc/mmlongbench`:
- `--max-docs N` (ingest) — cap downloads at the first N unique PDFs
- `--format {str,int,float,list,none}` (run) — slice by answer format; `none` = the ~22% intentionally unanswerable hallucination probes
- `--skip-unanswerable` (run) — drop unanswerable questions
- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
## Ingestion knobs (vision LLM, processing mode, summarize)
The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
| Flag pair | Effect |
|--------------------------------------------|-----------------------------------------------------------------------------------------|
| `--use-vision-llm` / `--no-vision-llm` | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
| `--processing-mode {basic,premium}` | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
| `--should-summarize` / `--no-summarize` | Generate a per-document summary at ingest. |
The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
> The backend's `ETL_SERVICE` env var (`DOCLING` | `UNSTRUCTURED` | `LLAMACLOUD`) is **not** per-upload. Restart the backend with a different `ETL_SERVICE` and re-ingest to compare ETLs (route through `--processing-mode premium` if your backend uses that mode for the stronger ETL).
### What `--use-vision-llm` produces
When vision is on, the backend's ETL pipeline (`app/etl_pipeline/picture_describer.py`) does, **per embedded image** in the PDF:
1. Extract the raw image bytes via `pypdf` (deduped by sha256, size-capped to match the vision LLM's per-image limit).
2. **Per-image OCR** — re-feed the image as a standalone upload through the configured ETL service (Docling / Azure DI / LlamaCloud) with `vision_llm=None`, so the ETL's OCR engine extracts the literal text-in-image.
3. **Visual description** — call the vision LLM on the image with a description-only prompt (it's explicitly told *not* to transcribe text — that's OCR's job). Steps 2 and 3 run in parallel per image.
4. Splice a horizontal-rule-delimited section **at the image's original position** in the parser markdown (replacing Docling's `<!-- image -->` placeholder + caption, or the bare `Image: <name>` caption a stripped-image parser leaves behind):
```markdown
---
**Embedded image:** `MM-130-a.jpeg`
**OCR text:**
Slice 24 / 60
L R
**Visual description:**
- Axial contrast-enhanced CT showing a large cystic mass in the left upper quadrant.
- Mass effect on the adjacent stomach; left kidney displaced inferiorly.
---
```
This is what makes `--scenario symmetric-cheap` and `--scenario cost-arbitrage` work: a non-vision LLM reading SurfSense's chunks sees the image's text and semantic content as plain markdown, alongside the surrounding case text, in the same retrieved chunk. Without it the cheap LLM would have nothing extra to read.
### A/B testing the same corpus with different settings
SurfSense dedupes uploads by `(filename, search_space_id)` — **not** by content hash and **not** by ingestion settings. Re-uploading the same filename to the same SearchSpace with a different `--use-vision-llm` flag silently skips re-processing. Give each variant its own SearchSpace:
```bash
# Baseline arm (vision off)
python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
python -m surfsense_evals ingest medical medxpertqa --no-vision-llm
python -m surfsense_evals run medical medxpertqa --n 100
python -m surfsense_evals teardown --suite medical
# Vision arm (the benchmark default)
python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
python -m surfsense_evals ingest medical medxpertqa
python -m surfsense_evals run medical medxpertqa --n 100
python -m surfsense_evals report --suite medical
```
Both runs land in `data/medical/runs/<ts>/medxpertqa/` with their settings recorded; rendered PDFs stay cached under `data/medical/medxpertqa/pdfs/` so the second `ingest` is upload-only.
## Environment variables
- `SURFSENSE_API_BASE` (default `http://localhost:8000`)
- `OPENROUTER_API_KEY` — required for the `native_pdf` arm and for `models list`
- One of `SURFSENSE_USER_EMAIL` + `SURFSENSE_USER_PASSWORD` (LOCAL), **or** `SURFSENSE_JWT` (+ optional `SURFSENSE_REFRESH_TOKEN`) for GOOGLE/pre-issued JWT
- `EVAL_DATA_DIR` (default `<project>/data`) — datasets, rendered PDFs, ingestion id maps, run outputs, `state.json`
- `EVAL_REPORTS_DIR` (default `<project>/reports`)
- `OPENROUTER_BASE_URL` (default `https://openrouter.ai/api/v1`) — only if you proxy OpenRouter
## Adding a new domain suite
1. Create `surfsense_evals/src/surfsense_evals/suites/<domain>/<benchmark>/` with `__init__.py`, `ingest.py`, `runner.py`, optional `prompt.py`.
2. Implement a `Benchmark` subclass (see `core/registry.py`); compose with `core.clients.*`, `core.arms.*`, `core.parse.*`, `core.metrics.*`.
3. Call `register(MyBenchmark())` at the bottom of `<benchmark>/__init__.py`. Auto-discovery picks it up; `setup --suite <domain>` and `ingest/run <domain> <benchmark>` work immediately.
Each suite gets its own SearchSpace (`eval-<suite>-<UTC-ts>`), `state.json` slot, data dir, reports dir, and pinned LLM. Suites never share a SearchSpace.
## Out of scope (follow-up PRs)
- Docker service for `docker compose run evals run medical medxpertqa`.
- Multi-model sweeps (one slug per `setup` for now; aggregate reports come later).
- A long-context-stuffing arm (give the model the same retrieved chunks SurfSense saw).
- LLM-judge grader for MMLongBench-Doc (paper uses GPT-4 as judge; we ship a deterministic rule-based grader).
- MedXpertQA-MM accuracy by image modality — dataset doesn't tag modality directly; we slice by `medical_task` and `body_system`.
- A `--slot <name>` flag that decouples the state-slot key from the benchmark registry's `suite` attribute, so parallel SearchSpaces with different ingestion settings can coexist on the same benchmark without `teardown` between A/B arms.
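See the design plan `medical_rag_evals_(mirage_+_curev1)_e797a324.plan.md` for the full design rationale. Note: it is referenced here by the author's local path, `c:/Users/91882/.cursor/plans/medical_rag_evals_(mirage_+_curev1)_e797a324.plan.md`, which is not resolvable from a checkout of this repository.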

22
surfsense_evals/data/.gitignore vendored Normal file
View file

@ -0,0 +1,22 @@
# Default: don't track anything under data/ (large benchmarks, PDFs,
# extracted markdown, ingestion caches, per-run artifacts can balloon).
*
# Always keep this gitignore file
!.gitignore
# Whitelist the artifacts the n=171 multimodal_doc / parser_compare blog
# report (`reports/blog/multimodal_doc_parser_compare_n171_report.md`)
# cites by path. These files are the verifiable source of the report's numbers.
#
# Path components have to be unblocked one level at a time because git
# does not look into a directory whose parent is ignored.
!multimodal_doc/
!multimodal_doc/runs/
!multimodal_doc/runs/2026-05-14T00-53-19Z/
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,37 @@
{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:15Z", "latency_ms": 20985, "error": null, "raw_text_chars": 696}]}, "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many adults rated Trump's government ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nLooking through the document, I find relevant information on page 4 (document page 6):\n\nThe document states: \"About four-in-ten Americans rate the ethical standards of Trump administration officials as excellent (9%) or good (30%), while 58% say they are not good (21%) or poor (36%).\"\n\nThe survey was conducted among 1,503 adults (as stated in the methodology section on page 14).\n\nTo find the number of adults who rated the ethical standards as \"poor\":\n36% of 1,503 = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 33894, "output_tokens": 191, "cost_micros": 104547, "latency_ms": 20968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:16Z", "latency_ms": 20453, "error": null, "raw_text_chars": 544}]}, "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at page 5 of the document, which shows \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the data for \"very confident\" and \"not at all confident\" percentages across different domains.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 30%\n\nFor \"Not at all confident\" (highest percentage):\n- Make wise decisions about immigration policy: 55%\n\nAnswer: Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 33942, "output_tokens": 117, "cost_micros": 103581, "latency_ms": 20438, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 41750, "error": null, "raw_text_chars": 1479}]}, "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I need to count the pictures in the document that contain only one person.\n\nLet me go through the document page by page:\n\nPage 1:\n- Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\nPage 2:\n- Photo of Capt. John W. Sanders - 1 person\n- Photo of Capt. Steven M. Wechsler - 1 person\n\nPage 3:\n- Photo of Afghan trainees in laboratory - multiple people\n\nPage 4:\n- Group photo of Pacific Partnership 2012 Internal Medicine Department staff - multiple people\n- Photo of Cmdr. Charmagne Beckett on flight deck - 1 person\n\nPage 5:\n- Photo of Petty Officer 1st Class Dennis Gonzales - 1 person\n\nPage 6:\n- Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n\nPage 7:\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Chris Martinez, Capt. Buhari Oyofo, Col. Vernon Graham, and Lt. Joseph Diclaro - 5 people\n\nPage 8:\n- Group photo of Kazakh scientists and NMRC staff - multiple people\n\nPage 9:\n- Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n- Photo of U.S. Marines and Sailors in aircraft - multiple people\n\nPage 10:\n- Group photo of Joint Combat Casualty Research Team - multiple people\n\nPage 12:\n- Photo of NMRC 2012 Dining Out Event - multiple people\n- Photo with Rear Adm. 
Bruce Doll, Capt. John Sanders and others - multiple people\n\nPage 13:\n- Photo with Ellen Flannery, Emmanuel Adu-Gyamfi, Eil", "citations": [], "input_tokens": 40387, "output_tokens": 512, "cost_micros": 128841, "latency_ms": 41735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "original_error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 36889, "error": null, "raw_text_chars": 1214}]}, "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure, specifically the number of multi-head self-attention layers.\n\nLet me search through the document for information about DETR's architecture.\n\nFrom page 2, I can see that DETR has a transformer encoder-decoder architecture.\n\nOn page 10 (Section 4.1), I find: \"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nLooking at the detailed architecture description on page 22 (Section A.3) and Figure 10, I can see that:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing multi-head self-attention AND multi-head attention (cross-attention)\n\nFrom page 9-10, the baseline model uses \"6 encoder, 6 decoder layers\"\n\nEach encoder layer has one multi-head self-attention layer.\nEach decoder layer has one multi-head self-attention layer and one multi-head cross-attention layer.\n\nSo the total number of multi-head self-attention layers (not counting cross-attention) is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 59866, "output_tokens": 318, "cost_micros": 184368, "latency_ms": 36875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", 
"finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "arm": "llamacloud_basic_lc", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:13Z", "latency_ms": 11328, "error": null, "raw_text_chars": 1376}]}, "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots.\n\nLooking through the figures in the document:\n\n1. **Figure 1**: Shows example images with text descriptions - no bar plots\n2. **Figure 2**: Shows data format comparison with images and text - no bar plots\n3. **Figure 3**: Shows the Syphus overview diagram - no bar plots\n4. **Figure 4**: This appears to show statistics and analysis. Let me examine it carefully:\n - Part (a): Shows verb-noun structure for instructions - this appears to be a bar chart\n - Part (b): Shows verb-noun structure for responses - this appears to be a bar chart\n - Part (c): Shows \"Diversity in our dataset\" with what appears to be distribution plots\n\nLooking at Figure 4 more carefully:\n- (a) contains what looks like a horizontal bar chart showing verbs and nouns from instructions\n- (b) contains what looks like a horizontal bar chart showing verbs and nouns from responses\n- (c) contains multiple distribution plots showing lengths and counts\n\n5. **Figure 5**: Shows response examples - no bar plots\n6. **Figure 6**: The document mentions Fig. 6(b) and Fig. 
6(c) but these are not fully visible in the provided pages\n\nFrom what I can see clearly in Figure 4:\n- Subfigure (a): 1 bar plot (showing instruction verb-noun structure)\n- Subfigure (b): 1 bar plot (showing response verb-noun structure)\n\nTotal visible bar plots: 2\n\nAnswer: 2", "citations": [], "input_tokens": 20240, "output_tokens": 354, "cost_micros": 66030, "latency_ms": 11328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:18Z", "latency_ms": 52514, "error": null, "raw_text_chars": 860}]}, "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll examine pages 5-10 to count the tables.\n\nPage 5: No tables visible in the text content shown.\n\nPage 6: Table 1 is present - \"Llama 2 family of models\" showing training data, parameters, context length, GQA, tokens, and LR for Llama 1 and Llama 2 models.\n\nPage 7: \n- Table 2: \"CO2 emissions during pretraining\" showing time, power consumption, and carbon emitted for different Llama 2 models.\n- Table 3: \"Overall performance on grouped academic benchmarks\" comparing MPT, Falcon, Llama 1, and Llama 2 models.\n\nPage 8:\n- Table 4: \"Comparison to closed-source models on academic benchmarks\" showing results for GPT-3.5, GPT-4, PaLM, PaLM-2-L, and Llama 2.\n- Table 5: \"SFT annotation\" showing examples of helpfulness and safety annotations.\n\nPage 9: No complete tables visible in the shown content.\n\nPage 10: No tables visible in the shown content.\n\nAnswer: 5", "citations": [], "input_tokens": 199213, "output_tokens": 280, "cost_micros": 601839, "latency_ms": 52468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:25Z", "latency_ms": 40452, "error": null, "raw_text_chars": 575}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 4, which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the bar chart showing \"Violation % (lower is safer)\":\n- Llama 2-70b-chat has the lowest violation percentage (approximately 0-5%)\n- Llama 2-13b-chat has the second lowest (approximately 0-5%)\n- Llama 2-34b-chat has the third lowest (approximately 5-10%)\n\nAnswer: Llama 2-70b-chat, Llama 2-13b-chat, Llama 2-34b-chat", "citations": [], "input_tokens": 199224, "output_tokens": 182, "cost_micros": 600402, "latency_ms": 40406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:05Z", "latency_ms": 10343, "error": null, "raw_text_chars": 376}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I can see the violation percentages (lower is safer):\n\nThe top-3 safest models (lowest violation %) are:\n1. Llama-2 7b-chat: 3%\n2. Llama-2 13b-chat: 3%\n3. Llama-2 70b-chat: 4%\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 70b-chat", "citations": [], "input_tokens": 105645, "output_tokens": 151, "cost_micros": 319200, "latency_ms": 10343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 70b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:10Z", "latency_ms": 29391, "error": null, "raw_text_chars": 525}]}, "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the QUAC (f1) 0-shot scores are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 105649, "output_tokens": 274, "cost_micros": 321057, "latency_ms": 29391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:15Z", "latency_ms": 43610, "error": null, "raw_text_chars": 650}]}, "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 9 and count the emojis in the left and right subfigures.\n\nLooking through the document, I find Figure 9 on page 16 (document page 16). The figure shows \"Issues with multi-turn memory (left) can be improved with GAtt (right).\"\n\nLooking at the left subfigure:\n- It shows a conversation with text-only responses (no emojis visible)\n\nLooking at the right subfigure:\n- First response: 4 emojis (\ud83d\udc4b\ud83d\udc41\ufe0f\ud83d\udc44\ud83d\udc41\ufe0f)\n- Second response: 4 emojis (\u2708\ufe0f\u2708\ufe0f\ud83d\udeeb\ud83d\udcba)\n- Third response: 4 emojis (\ud83e\uddb4\ud83e\udd95\ud83e\uddb4\ud83d\udd25)\n- Fourth response: 2 emojis (\ud83c\uddee\ud83c\uddf9\ud83c\udf5d\ud83d\udc90)\n\nTotal in right subfigure: 4 + 4 + 4 + 2 = 14 emojis\n\nTotal in left subfigure: 0 emojis\n\nDifference: 14 - 0 = 14\n\nAnswer: 14", "citations": [], "input_tokens": 199223, "output_tokens": 267, "cost_micros": 601674, "latency_ms": 43578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "original_error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:40Z", "latency_ms": 43593, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:24Z", "latency_ms": 44391, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:10Z", "latency_ms": 35797, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:54:48Z", "latency_ms": 42266, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:55:41Z", "latency_ms": 31983, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:59Z", "latency_ms": 33250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:33Z", "latency_ms": 40889, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:16Z", "latency_ms": 51515, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:55:10Z", "latency_ms": 54500, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:56:14Z", "latency_ms": 32641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:13Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:56:56Z", "latency_ms": 52202, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:57:50Z", "latency_ms": 40469, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:58:34Z", "latency_ms": 49766, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:59:30Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "azure_premium_lc", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:47Z", "latency_ms": 24578, "error": null, "raw_text_chars": 640}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in the screenshot.\n\nLooking through the document, I can find Figure 111 which is part of Section 9.8 on GUI Navigation, specifically about Online Shopping for an Ergonomic Keyboard.\n\nIn Figure 111, the screenshot shows a shopping cart with a keyboard item. The text indicates:\n\n\"Subtotal $4999\"\n\nAnd in the cart details, it shows:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...\n$4999\"\n\nThe price shown is $49.99 (represented as $4999 in the format used in the screenshot, which appears to be missing the decimal point).\n\nAnswer: 49.99", "citations": [], "input_tokens": 133648, "output_tokens": 176, "cost_micros": 403584, "latency_ms": 24578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:57:11Z", "latency_ms": 69062, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:58:21Z", "latency_ms": 35639, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:58:59Z", "latency_ms": 51312, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:59:54Z", "latency_ms": 49218, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:00:48Z", "latency_ms": 56375, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "azure_basic_lc", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:12Z", "latency_ms": 7967, "error": null, "raw_text_chars": 594}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and identify the battery percentage shown in that screenshot.\n\nLooking through the document for Figure 107, I can see it's in Section 9.8 on GUI Navigation, specifically related to online shopping.\n\nIn the image caption list, Figure 107 is described as \"Section 9.8: online shopping.\"\n\nLooking at the actual Figure 107 in the document, I can see it's a smartphone screenshot showing an Amazon shopping interface. At the top of the screenshot, there is a time display \"5:32\" and next to it shows \"75\" which represents the battery percentage.\n\nAnswer: 75", "citations": [], "input_tokens": 117429, "output_tokens": 143, "cost_micros": 354432, "latency_ms": 7967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.0", "normalised_gold": "76.0"}}
{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:20Z", "latency_ms": 33344, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:00:54Z", "latency_ms": 43733, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:01:39Z", "latency_ms": 33390, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:02:16Z", "latency_ms": 33641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:02:59Z", "latency_ms": 38000, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:01:45Z", "latency_ms": 35218, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:02:21Z", "latency_ms": 44921, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:03:08Z", "latency_ms": 52842, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:04:05Z", "latency_ms": 27250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:04:41Z", "latency_ms": 48031, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:37Z", "latency_ms": 20843, "error": null, "raw_text_chars": 345}]}, "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 2, the task formulation section discusses the \"Setting of Conscious Incompetence.\"\n\nThe relevant text states: \"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 60205, "output_tokens": 80, "cost_micros": 181815, "latency_ms": 20828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:57Z", "latency_ms": 14639, "error": null, "raw_text_chars": 1219}]}, "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to identify which modules use Large Language Models (LLMs) as their backbone.\n\nFrom Section 3, the QACHECK system has 5 modules:\n1. **Claim Verifier** - Section 3.1 states it's \"based on InstructGPT\" (an LLM)\n2. **Question Generator** - Section 3.2 states \"Similar to the claim verifier, we also leverage InstructGPT\" (an LLM)\n3. **Question Answering Model** - Section 3.3 describes three implementations, with the default being \"GPT Reciter-Reader\" which uses InstructGPT (an LLM). However, this module also has non-LLM options (Retriever-Reader, FLAN-T5).\n4. **QA Validator** - Section 3.4 states \"We again implement the validator by prompting InstructGPT\" (an LLM)\n5. 
**Reasoner** - Section 3.5 provides \"two different implementations: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\"\n\nCounting the modules that use LLM backbones:\n- Claim Verifier: Yes (InstructGPT)\n- Question Generator: Yes (InstructGPT)\n- Question Answering Model: Yes (uses GPT Reciter-Reader by default)\n- QA Validator: Yes (InstructGPT)\n- Reasoner: Yes (InstructGPT is one of the two implementations)\n\nAll 5 modules have LLM implementations, with some offering alternatives.\n\nAnswer: 5", "citations": [], "input_tokens": 27149, "output_tokens": 403, "cost_micros": 87492, "latency_ms": 14639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:12Z", "latency_ms": 6562, "error": null, "raw_text_chars": 406}]}, "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the sociology subject in MMMU.\n\nIn Figure 3, which shows the distribution of subjects across disciplines, I can see:\n\nUnder \"Humanities & Social Sci. (9%)\":\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- **Sociology (287, 2.48%)**\n\nThe number 287 represents the number of samples in the sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 103232, "output_tokens": 123, "cost_micros": 311541, "latency_ms": 6562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:19Z", "latency_ms": 62062, "error": null, "raw_text_chars": 760}]}, "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Economics-related error cases that fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\nFrom the Case Study section (pages 20-21), I can see Table 10 which shows the index of case study figures by subjects with associated error categories.\n\nLooking at the Economics row in Table 10:\n- Economics has: Correct Case (22), Perceptual Error (23, 24)\n\nThis indicates that there are 2 Economics-related error cases in the Perceptual Error category (Figures 23 and 24).\n\nLet me verify by checking the actual figures:\n- Figure 23 (page 37): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n- Figure 24 (page 38): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n\nAnswer: 2", "citations": [], "input_tokens": 259127, "output_tokens": 212, "cost_micros": 1559532, "latency_ms": 62014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_basic_lc", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:21Z", "latency_ms": 5093, "error": null, "raw_text_chars": 174}]}, "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists the subfields of each subject:\n\nFor Energy & Power, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 83153, "output_tokens": 47, "cost_micros": 250164, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:26Z", "latency_ms": 64485, "error": null, "raw_text_chars": 1513}]}, "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 1 in the document and count the different icons shown as different image types.\n\nLooking at page 1, Figure 1 is titled \"Overview of the MMMU dataset\" and shows four main sections:\n\n1. **Comprehensive Disciplines** - Shows icons/images representing different disciplines (Engineering, Science, Art & Design, Business, Humanities & Social Sci., Medicine)\n\n2. **Heterogeneous Image Types** - Shows various image type icons including:\n - Diagrams\n - Tables\n - Plots and Charts\n - Photographs\n - Chemical Structures\n - Paintings\n - Medical Images\n - Sheet Music\n - Geometric\n - Pathology images\n - Microscopic Images\n - Comics\n\n3. **Interleaved Text and Images** - Shows a medical question example with 3 MRI images\n\n4. **Expert-level Skills Test** - Shows a diagram with three components: Perception, Knowledge, and Reasoning\n\nThe question asks specifically about \"different icon are shown as different image types\" in Figure 1. Looking at the \"Heterogeneous Image Types\" section, I can count the distinct image type icons shown.\n\nCounting the image types listed in the \"Heterogeneous Image Types\" section:\nDiagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, and there's an ellipsis (...) 
suggesting more types.\n\nBased on what's explicitly visible and labeled in Figure 1, I can count approximately 12 distinct image type representations shown.\n\nAnswer: 12", "citations": [], "input_tokens": 259123, "output_tokens": 378, "cost_micros": 1563243, "latency_ms": 64406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:29Z", "latency_ms": 75843, "error": null, "raw_text_chars": 831}]}, "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts that appear in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Figure 4: A comparison chart showing benchmarks plotted on a 2D graph with \"Breadth (Knowledge)\" on x-axis and \"Depth (Reasoning)\" on y-axis\n- A table comparing different datasets\nTotal: 1 chart\n\n**Page 6:**\n- Table 2: Results table (not a chart)\nTotal: 0 charts\n\n**Page 7:**\n- Figure 5: Bar chart showing \"Performance of models on different types of images\"\n- Table 3: Results table (not a chart)\nTotal: 1 chart\n\n**Page 8:**\n- Figure 6: Pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\nTotal: 1 chart\n\n**Page 9:**\n- No charts, only text\nTotal: 0 charts\n\n**Page 10:**\n- No charts, only text (references section)\nTotal: 0 charts\n\nTotal charts in pages 5-10: 1 + 0 + 1 + 1 + 0 + 0 = 3\n\nAnswer: 3", "citations": [], "input_tokens": 259120, "output_tokens": 289, "cost_micros": 1561222, "latency_ms": 75796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:30Z", "latency_ms": 77217, "error": null, "raw_text_chars": 404}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 on page 8 of the document, which shows the \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe error categories and their percentages are:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 259144, "output_tokens": 133, "cost_micros": 1557856, "latency_ms": 77171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "azure_premium_lc", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:45Z", "latency_ms": 7750, "error": null, "raw_text_chars": 387}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe pie chart shows the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 109199, "output_tokens": 125, "cost_micros": 329472, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:53Z", "latency_ms": 59125, "error": null, "raw_text_chars": 830}]}, "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 for error types mentioned in the case studies.\n\nFrom page 95 (Figure 80 - Architecture and Engineering: Surveying and Mapping):\n- This is a correct case, no error type mentioned\n\nFrom page 96 (Figure 82 - Architecture and Engineering: Civil Engineering):\n- Error Category: Reasoning Error\n\nFrom page 97 (Figure 83 - Computer Science: Data Structure and Algorithm):\n- This is a correct case, no error type mentioned\n\nFrom page 98 (Figure 84 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error, Lack of Knowledge\n\nFrom page 99 (Figure 85 - Computer Science: Operating System):\n- Error Category: Perceptual Error\n\nFrom page 100 (Figure 86 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 259136, "output_tokens": 226, "cost_micros": 1559901, "latency_ms": 59077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:48Z", "latency_ms": 11110, "error": null, "raw_text_chars": 1296}]}, "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (correct predictions by GPT-4V) in Pages 65-70.\n\nLet me go through each figure in this range:\n\n**Page 65 - Figure 51 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (E) Familial polyposis coli\n- Ground Truth: (E) Familial polyposis coli\n- **SUCCESS**\n\n**Page 66 - Figure 52 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (D) Ruptured berry aneurysm\n- Ground Truth: (D) Ruptured berry aneurysm\n- **SUCCESS**\n\n**Page 67 - Figure 53 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 68 - Figure 54 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 69 - Figure 55 (Clinical Medicine: Heart Disease)**\n- GPT-4V answered (B) Mitral regurgitation\n- Ground Truth: (C) Aortic value regurgitation\n- Error Category: Lack of Knowledge\n- **FAILURE**\n\n**Page 70 - Figure 56 (Diagnostics and Laboratory Medicine: Radiology)**\n- GPT-4V correctly identified (C) Dandy-Walker syndrome\n- Ground Truth: (C) Dandy-Walker syndrome\n- **SUCCESS**\n\nTotal successful cases: 3\n\nAnswer: 3", "citations": [], "input_tokens": 103232, "output_tokens": 424, "cost_micros": 316056, "latency_ms": 11110, "error": 
null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:52Z", "latency_ms": 23593, "error": null, "raw_text_chars": 1091}]}, "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the figure with a tree shape and identify which branch has the least leaves.\n\nLooking through the document, I can see Figure 1 on page 2, which shows a \"Technology tree of RAG research\" with a tree-like structure.\n\nExamining this tree diagram, I can see it has a trunk labeled \"Retrieval-Augmented Generation\" and branches extending upward. The main branches appear to be:\n\n1. A left branch (colored in teal/green) showing the evolution from pre-training through fine-tuning stages\n2. A middle branch (colored in orange) labeled \"Instructlens\" \n3. 
A right branch (colored in blue) showing the inference stage with multiple methods\n\nLooking at the number of \"leaves\" (individual methods/papers) on each branch:\n- The left branch (Pre-training/Fine-tuning) has several leaves\n- The middle branch (Instructlens) appears to have the fewest leaves - it's relatively short with minimal branching\n- The right branch (Inference) has many leaves with extensive branching\n\nThe branch labeled \"Instructlens\" in the middle appears to have the least number of leaves.\n\nAnswer: Instructlens", "citations": [], "input_tokens": 64633, "output_tokens": 269, "cost_micros": 197934, "latency_ms": 23577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "instructlens", "normalised_gold": "pre training"}}
{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "arm": "azure_premium_lc", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:59Z", "latency_ms": 7156, "error": null, "raw_text_chars": 752}]}, "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Appendix F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 Tree Traversal Algorithm:**\n- Lines: function declaration, initialization of Scurrent, for layer loop, initialization of topk, for node loop, score calculation, append to top_k, end for, Slayer assignment, Scurrent update, end for, return statement, end function\n- Total: 13 lines\n\n**Algorithm 2 Collapsed Tree Algorithm:**\n- Lines: function declaration, flatten tree, initialization of top_nodes, for node loop, append with dot product, end for, sort top_nodes, result initialization, total_tokens initialization, for node loop, if condition, result append, end if, total_tokens update, end for, return statement, end function\n- Total: 16 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 27211, "output_tokens": 190, "cost_micros": 84483, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:06Z", "latency_ms": 16641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:23Z", "latency_ms": 11860, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:36Z", "latency_ms": 12422, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:08:54Z", "latency_ms": 19625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n 
\\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:25Z", "latency_ms": 14078, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:16Z", "latency_ms": 13922, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:30Z", "latency_ms": 25952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:57Z", "latency_ms": 14718, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:09:14Z", "latency_ms": 33452, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned 
error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:54Z", "latency_ms": 22093, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:09:39Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:09:52Z", "latency_ms": 15921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:10Z", "latency_ms": 21921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:10:34Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned 
error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:10:53Z", "latency_ms": 13686, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:10:16Z", "latency_ms": 14281, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:10:32Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:45Z", "latency_ms": 11952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:01Z", "latency_ms": 20641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n 
\\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:11:30Z", "latency_ms": 13985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:06Z", "latency_ms": 17157, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"{\\\"message\\\":\\\"Input is too long.\\\"}\",\"provider_name\":\"Amazon Bedrock\",\"is_byok\":false}},\"user_id\":\"user_3CNdnY1vL3Ln9TYRiGAii5kmBvu\"}", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:11:25Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:11:40Z", "latency_ms": 13734, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:59Z", "latency_ms": 15828, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": 
\\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:25Z", "latency_ms": 19985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "original_error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:44Z", "latency_ms": 15875, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:12:00Z", "latency_ms": 16625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:12:19Z", "latency_ms": 12937, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": 
\\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:12:35Z", "latency_ms": 14625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:57Z", "latency_ms": 12156, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n \\\"error\\\": {\\n \\\"code\\\": 400,\\n \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}

View file

@ -0,0 +1,100 @@
{
"config": {
"base_delay": 1.0,
"concurrency": 2,
"llm_model": "anthropic/claude-sonnet-4.5",
"max_attempts": 5,
"max_delay": 30.0,
"max_output_tokens": 512,
"pdf_engine": "native"
},
"elapsed_s": 1373.6,
"n_failed_rows_input": 37,
"n_retried": 37,
"per_arm": {
"azure_basic_lc": {
"attempts_distribution": [
1
],
"recovered": 1,
"recovery_rate": 1.0,
"still_failed": 0,
"tried": 1
},
"azure_premium_lc": {
"attempts_distribution": [
1,
1,
1
],
"recovered": 3,
"recovery_rate": 1.0,
"still_failed": 0,
"tried": 3
},
"llamacloud_basic_lc": {
"attempts_distribution": [
1,
1
],
"recovered": 2,
"recovery_rate": 1.0,
"still_failed": 0,
"tried": 2
},
"llamacloud_premium_lc": {
"attempts_distribution": [
1,
1,
1,
1
],
"recovered": 4,
"recovery_rate": 1.0,
"still_failed": 0,
"tried": 4
},
"native_pdf": {
"attempts_distribution": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5
],
"recovered": 15,
"recovery_rate": 0.5555555555555556,
"still_failed": 12,
"tried": 27
}
},
"raw_retries_path": "data\\multimodal_doc\\runs\\2026-05-14T00-53-19Z\\parser_compare\\raw_retries.jsonl",
"run_id": "2026-05-14T00-53-19Z",
"totals": {
"recovered": 25,
"still_failed": 12,
"tried": 37
}
}

View file

@ -0,0 +1,63 @@
[project]
name = "surfsense-evals"
version = "0.1.0"
description = "Domain-agnostic evaluation harness for SurfSense (medical RAG suite ships first; legal/finance/code suites slot in under suites/)."
readme = "README.md"
requires-python = ">=3.12"
license = { text = "Apache-2.0" }
authors = [{ name = "SurfSense" }]
dependencies = [
"httpx>=0.27.0",
"httpx-sse>=0.4.0",
"datasets>=2.21.0",
"huggingface_hub>=0.24.0",
"reportlab>=4.0.0",
"Pillow>=10.0.0",
"pyarrow>=15.0.0",
"pydantic>=2.6.0",
"tqdm>=4.66.0",
"numpy>=1.26.0",
"scikit-learn>=1.4.0",
"scipy>=1.12.0",
"python-dotenv>=1.0.0",
"rich>=13.7.0",
"trafilatura>=1.12.0",
"pypdf>=5.1.0",
"azure-ai-documentintelligence>=1.0.2",
"llama-cloud-services>=0.6.25",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.23.0",
"respx>=0.21.0",
"ruff>=0.5.0",
]
[project.scripts]
surfsense-evals = "surfsense_evals.core.cli:main"
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]
include = ["surfsense_evals*"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
markers = [
"integration: opt-in tests that hit a live SurfSense instance (run with `-m integration`)",
]
[tool.ruff]
line-length = 100
target-version = "py312"
[tool.ruff.lint]
select = ["E", "F", "I", "B", "UP", "SIM", "ASYNC"]
ignore = ["E501"]

13
surfsense_evals/reports/.gitignore vendored Normal file
View file

@ -0,0 +1,13 @@
# Default: don't track auto-generated `summary.md` / `summary.json` from
# every benchmark run — those are derivative of `data/.../runs/<id>/`.
*
!.gitignore
# Hand-curated sample report kept as a reference for the medical suite.
!medical/
!medical/sample_summary.md
# Hand-curated blog-ready writeups (one per experiment) — these *are*
# the public citation surface and must travel with the repo.
!blog/
!blog/*.md

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,125 @@
"""Were the SSL failures clustered in time (network blip) or evenly
distributed (sustained limit)? Failures are located by their sequential
position within each arm (row order in raw.jsonl) and grouped into
clusters of indices that lie within 5 positions of each other.
Also: for the one *real* intrinsic failure — the 30MB Anthropic limit
on 2405.09818v1.pdf::Q007 — print the error message (first 600 chars)
plus the PDF's on-disk and estimated base64 sizes so the blog has a
clean root cause.
"""
from __future__ import annotations
import json
from collections import Counter, defaultdict
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
RAW = RUN / "raw.jsonl"
PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
def _failure_kind(err: str, empty: bool) -> str:
    """Label a failed row from substring markers in its error text.

    Args:
        err: The row's error string ("" when the row carried none).
        empty: True when the row's ``raw_text`` is blank.

    Returns:
        One of "ssl", "size_limit", "5xx", "json_decode", "empty" or
        "other". The size-limit test runs before the 502/503 test so that
        a byte count containing those digits is not reported as "5xx".
    """
    lowered = err.lower()
    if "SSLError" in err:
        return "ssl"
    if "exceeds" in lowered and "limit" in lowered:
        return "size_limit"
    if "502" in err or "503" in err:
        return "5xx"
    if "JSONDecodeError" in err:
        return "json_decode"
    if empty and not err:
        return "empty"
    return "other"


def main() -> None:
    """Print three failure reports for the hard-coded parser_compare run.

    1. Per arm: the sequential indices of failed rows and how they cluster.
    2. Details of the native_pdf failure on 2405.09818v1.pdf::Q007.
    3. Per PDF: failure count, failure kinds and affected arms.

    A row counts as failed when it has an error string or a blank
    ``raw_text``.
    """
    rows = [
        json.loads(line)
        for line in RAW.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]

    # 1) Clustering: position of every failed row within its arm.
    failed_idx_by_arm: dict[str, list[int]] = defaultdict(list)
    seen_per_arm: dict[str, int] = defaultdict(int)
    for row in rows:
        arm = row["arm"]
        idx = seen_per_arm[arm]
        seen_per_arm[arm] += 1
        err = row.get("error") or ""
        empty = not (row.get("raw_text") or "").strip()
        if err or empty:
            failed_idx_by_arm[arm].append(idx)

    print("=" * 80)
    print("SSL/network-error indices per arm (each arm processes 171 questions in")
    print("order; index = sequential position within that arm). Tight clustering")
    print("in time = transient blip, even spread = sustained limit.")
    print("=" * 80)
    for arm in sorted(failed_idx_by_arm):
        items = failed_idx_by_arm[arm]
        if not items:
            continue
        idxs = sorted(set(items))
        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
        # Split the sorted indices into runs whose neighbours are at most
        # 5 positions apart; each run is treated as one time window.
        cluster_runs = []
        cur = [idxs[0]]
        for i in idxs[1:]:
            if i - cur[-1] <= 5:
                cur.append(i)
            else:
                cluster_runs.append(cur)
                cur = [i]
        cluster_runs.append(cur)
        print(f" clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")

    # 2) The 30MB intrinsic failure — details for the first matching row.
    print()
    print("=" * 80)
    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
    print("=" * 80)
    for row in rows:
        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
            err = row.get("error") or ""
            print(f" qid: {row['qid']}")
            print(f" doc: {row['doc_id']}, pages: {row.get('pages')}")
            pdf_path = PDFS / row["doc_id"]
            if pdf_path.exists():
                size_mb = pdf_path.stat().st_size / (1024 * 1024)
                print(f" PDF size on disk: {size_mb:.1f} MB")
                # base64 inflates ~33%
                est_b64 = size_mb * 1.33
                print(f" estimated base64 wire size: {est_b64:.1f} MB")
            print(f" full error: {err[:600]}")
            break

    # 3) Per-PDF: which PDFs are pathological?
    print()
    print("=" * 80)
    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
    print("=" * 80)
    by_pdf: dict[str, list[dict]] = defaultdict(list)
    for row in rows:
        err = row.get("error") or ""
        empty = not (row.get("raw_text") or "").strip()
        if err or empty:
            by_pdf[row["doc_id"]].append({
                "arm": row["arm"],
                "qid": row["qid"],
                "err_kind": _failure_kind(err, empty),
                "pages": row.get("pages"),
            })
    # Most-failing PDFs first; ties broken alphabetically by doc id.
    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
        kinds = Counter(i["err_kind"] for i in items)
        arms = sorted({i["arm"] for i in items})
        pages = items[0]["pages"]
        print(f" {doc} pages={pages} failures={len(items)} arms={arms}")
        print(f" kinds: {dict(kinds)}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,155 @@
"""Drill into the parser_compare n=171 raw.jsonl to surface every
failure, group by arm + PDF, and dump the underlying error strings so
we can write up a clean failure-mode taxonomy for the blog post.
Outputs (printed to stdout + written to `failures_n171.json`):
* per-arm failure count and rate
* per-PDF failure count across all arms (which docs are pathological?)
* error-string clusters per arm (so we can give human-readable causes)
* sample failure rows (one per cluster) for the appendix
"""
from __future__ import annotations
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
REPO = Path(__file__).resolve().parents[1]
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
RAW = RUN / "raw.jsonl"
OUT = REPO / "scripts" / "failures_n171.json"
def _classify(error: str | None, raw_text: str) -> str:
"""Coarse-grained bucket for an error message."""
blob = (error or "").lower()
if not blob and not raw_text.strip():
return "empty_response"
if "rate limit" in blob or "429" in blob:
return "rate_limit"
if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
return "context_overflow"
if "could not process image" in blob or "invalid image" in blob:
return "image_decode_failure"
if "could not process pdf" in blob or "invalid_request_error" in blob and "pdf" in blob:
return "pdf_decode_failure"
if "timeout" in blob or "timed out" in blob:
return "timeout"
if "5xx" in blob or "internal server error" in blob or "503" in blob or "502" in blob:
return "provider_5xx"
if "filenotfound" in blob:
return "missing_extraction"
if "badrequest" in blob:
return "provider_400"
if blob:
return "other_error"
return "unknown"
def main() -> None:
    """Report every failed row of the parser_compare run and dump a summary.

    A row is a failure when it has an error string or a blank
    ``raw_text``. Prints per-arm failure rates, per-arm error clusters
    with one example each, PDFs with at least two failures, and every
    native_pdf failure; then writes the collected data as JSON to ``OUT``.
    """
    rows = []
    for line in RAW.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))

    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
    n_per_arm: dict[str, int] = defaultdict(int)

    for row in rows:
        arm = row["arm"]
        n_per_arm[arm] += 1
        err = row.get("error")
        raw_text = row.get("raw_text") or ""
        if not err and raw_text.strip():
            continue  # healthy row: no error and a non-blank answer
        cluster = _classify(err, raw_text)
        entry = {
            "qid": row["qid"],
            "doc_id": row["doc_id"],
            "answer_format": row["answer_format"],
            "gold": row["gold"],
            "error": err,
            "cluster": cluster,
            "raw_text_len": len(raw_text),
            "pages": row.get("pages"),
        }
        by_arm_failures[arm].append(entry)
        by_pdf_failures[row["doc_id"]].append({**entry, "arm": arm})
        error_clusters[arm][cluster].append(entry)

    rule = "=" * 90

    def banner(title: str) -> None:
        print(rule)
        print(title)
        print(rule)

    banner("Per-arm failure count & rate")
    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
    for arm in sorted(n_per_arm):
        n_fail = len(by_arm_failures[arm])
        total = n_per_arm[arm]
        print(f"{arm:<25} {total:>4} {n_fail:>5} {n_fail / total * 100:>5.1f}%")

    print()
    banner("Failure clusters per arm")
    for arm in sorted(error_clusters):
        print(f"\n{arm}:")
        for cluster, items in sorted(error_clusters[arm].items()):
            print(f" {cluster:<22} {len(items):>3}")
            sample = items[0]
            err_short = (sample["error"] or "")[:200].replace("\n", " ")
            print(f" example: {sample['qid']} doc={sample['doc_id']} pages={sample['pages']}")
            print(f" error: {err_short}")

    print()
    banner("Per-PDF failure totals (PDFs with >=2 failures)")
    pdf_counts = Counter({pdf: len(hits) for pdf, hits in by_pdf_failures.items()})
    for pdf, count in pdf_counts.most_common():
        if count < 2:
            break  # counts are in descending order, so nothing further qualifies
        hits = by_pdf_failures[pdf]
        arms_failed = sorted({hit["arm"] for hit in hits})
        pages = hits[0].get("pages")
        print(f" {pdf} pages={pages} failures={count} arms={arms_failed}")

    print()
    banner("All native_pdf failures (one row per failure)")
    for entry in by_arm_failures.get("native_pdf", []):
        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
        print(f" {entry['qid']} doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
        print(f" err: {err}")

    per_arm: dict[str, Any] = {}
    for arm in sorted(n_per_arm):
        failures = by_arm_failures[arm]
        per_arm[arm] = {
            "n": n_per_arm[arm],
            "failures": len(failures),
            "rate": len(failures) / n_per_arm[arm],
            "clusters": {
                cluster: len(items)
                for cluster, items in error_clusters[arm].items()
            },
            "rows": failures,
        }
    per_pdf = {
        pdf: [dict(hit) for hit in failures]
        for pdf, failures in by_pdf_failures.items()
    }
    summary: dict[str, Any] = {"per_arm": per_arm, "per_pdf": per_pdf}
    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"\nWrote: {OUT}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,60 @@
"""Sanity check extraction sizes against Sonnet 4.5's context window.
Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
markdown is ~4 chars/token, so anything over ~750k chars likely won't
fit alongside the system + question + 512 max_output_tokens. Print
warnings for any extraction that's at risk.
"""
from __future__ import annotations
import json
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
CHARS_PER_TOKEN = 4
CTX_TOKENS = 200_000
PROMPT_OVERHEAD_TOKENS = 1_000 # system + question + format hint
MAX_OUTPUT_TOKENS = 512
SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
def main() -> None:
    """Flag extractions whose character count exceeds ``SAFE_CHARS``.

    Reads the doc-map JSONL at ``MAP``, records the largest extraction per
    arm, and prints either the list of over-budget extractions or a
    confirmation that none exceed the budget.
    """
    manifest = []
    for line in MAP.read_text(encoding="utf-8").splitlines():
        if line.strip():
            manifest.append(json.loads(line))

    largest: dict[str, tuple[int, str]] = {}
    at_risk: list[tuple[str, str, int]] = []
    for entry in manifest:
        extractions = entry.get("extractions") or {}
        for arm, ext in extractions.items():
            n_chars = int(ext.get("chars") or 0)
            best = largest.get(arm)
            if best is None or best[0] < n_chars:
                largest[arm] = (n_chars, entry["doc_id"])
            if n_chars > SAFE_CHARS:
                at_risk.append((entry["doc_id"], arm, n_chars))

    print(f"PDFs in manifest: {len(manifest)}")
    print(f"safe char budget: {SAFE_CHARS:,} (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
    print()
    print("largest extraction per arm:")
    for arm in sorted(largest):
        n_chars, doc_id = largest[arm]
        print(f" {arm:25s} {n_chars:>10,} chars ({doc_id})")
    print()

    if not at_risk:
        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
        return
    print(f"OVERFLOW RISK ({len(at_risk)} extractions > safe budget):")
    for doc_id, arm, n_chars in at_risk:
        approx_tokens = n_chars // CHARS_PER_TOKEN
        print(f" {doc_id} :: {arm} :: {n_chars:,} chars (~{approx_tokens:,} tokens)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,77 @@
"""Query SurfSense for the status of every MMLongBench PDF in scope.
Uses the existing SurfSense documents client to query
``/documents/status?document_ids=...`` for both the known-existing 5
PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
(7577-7600 range).
"""
from __future__ import annotations
import asyncio
import os
from pathlib import Path
import httpx
from dotenv import load_dotenv
REPO = Path(__file__).resolve().parents[1]
PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
async def main() -> None:
    """Print the SurfSense status of every locally cached MMLongBench PDF.

    Loads ``.env`` from the repo root, then issues one GET to
    ``/api/v1/documents/status`` for a fixed set of candidate document ids
    (5219-5223 and 7577-7624) in search space 55. Returned items are
    matched to local PDF file names by title, and a per-state summary is
    printed at the end.

    Raises:
        SystemExit: if ``SURFSENSE_JWT`` is not set.
        httpx.HTTPStatusError: if the status endpoint returns an error code.
    """
    load_dotenv(REPO / ".env")
    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
    token = os.environ.get("SURFSENSE_JWT")
    if not token:
        raise SystemExit("SURFSENSE_JWT missing from .env")

    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
    print(f"local cached PDFs: {len(pdf_names)}")

    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
    }
    async with httpx.AsyncClient(timeout=30.0) as http:
        r = await http.get(
            f"{base}/api/v1/documents/status",
            params={
                "search_space_id": 55,
                "document_ids": ",".join(str(d) for d in candidate_ids),
            },
            headers=headers,
        )
        r.raise_for_status()
        # Treat an explicit `"items": null` the same as a missing key.
        items = r.json().get("items") or []

    by_title: dict[str, dict] = {}
    for it in items:
        status = it.get("status") or {}
        by_title[it.get("title", "")] = {
            "id": it.get("id"),
            "state": status.get("state"),
            "reason": status.get("reason"),
        }

    by_state: dict[str, int] = {}
    print()
    for name in pdf_names:
        info = by_title.get(name)
        if info is None:
            print(f" [missing ] {name}")
            by_state["missing"] = by_state.get("missing", 0) + 1
        else:
            tag = info["state"] or "?"
            # None does not accept a width spec (`{None:>5}` raises
            # TypeError), so substitute a placeholder when the id is absent.
            doc_id = "?" if info["id"] is None else info["id"]
            print(f" [{tag:13s}] doc_id={doc_id:>5} {name}")
            by_state[tag] = by_state.get(tag, 0) + 1

    print()
    print("summary:")
    for k, v in sorted(by_state.items()):
        print(f" {k}: {v}")


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,112 @@
"""Compute "intrinsic" accuracy by removing transient network errors.
A failure is *transient* if it's:
* SSLError: SSL bad-record-mac (TLS hiccup)
* Cloudflare 502 / 503 (provider-side load shedding)
* empty_response with no error string and no other signal (likely
connection reset mid-stream)
* JSONDecodeError (parse error mid-stream)
A failure is *intrinsic* if it's a hard limit:
* "exceeds .* limit" (size limits)
* context_length errors
* provider 400 with image / pdf decode failure
* malformed-input failures
We re-compute accuracy with two denominators:
* raw acc = correct / 171 (what the headline reports)
* adjusted acc = correct / (171 - transient_failures) (intrinsic)
Outputs a table that we can drop straight into the blog.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
RAW = RUN / "raw.jsonl"
# Lower-case substrings that mark an error as a transient network or
# provider hiccup.
TRANSIENT_HINTS = (
    "sslv3_alert_bad_record_mac",
    "ssl_alert_bad_record_mac",
    "ssl: ssl",
    "cloudflare",
    "error 502",
    "error 503",
    "bad gateway",
    "service unavailable",
    "gateway timeout",
    "jsondecodeerror",
)

# Lower-case substrings that mark an error as a hard limit.
INTRINSIC_HINTS = (
    "exceeds",
    "context_length",
    "context window",
    "could not process pdf",
    "could not process image",
)


def classify(error: str | None, raw_text: str) -> str:
    """Sort one result row into a transient / intrinsic / other bucket.

    Args:
        error: The row's error message, or None/"" when there was none.
        raw_text: The model's raw answer text for the row.

    Returns:
        "transient_empty" when there is no error and no non-whitespace
        text; "ok" when there is text and no error; otherwise
        "transient_ssl_or_5xx", "intrinsic_limit" or "other_error".
        Transient hints are checked before intrinsic ones.
    """
    lowered = (error or "").lower()
    if lowered:
        ordered_buckets = (
            ("transient_ssl_or_5xx", TRANSIENT_HINTS),
            ("intrinsic_limit", INTRINSIC_HINTS),
        )
        for bucket, hints in ordered_buckets:
            for hint in hints:
                if hint in lowered:
                    return bucket
        return "other_error"
    return "ok" if raw_text.strip() else "transient_empty"
def main() -> None:
    """Print raw vs. transient-adjusted accuracy for each arm.

    Every row of ``RAW`` is graded via its ``graded.correct`` flag and
    bucketed with ``classify``. The adjusted accuracy divides the number
    of correct rows by the rows that did not fail transiently.
    """
    rows = []
    for line in RAW.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))

    def _blank_tally() -> dict:
        return {
            "n": 0, "correct": 0,
            "transient_ssl_or_5xx": 0, "transient_empty": 0,
            "intrinsic_limit": 0, "other_error": 0,
        }

    by_arm: dict[str, dict] = defaultdict(_blank_tally)
    for row in rows:
        tally = by_arm[row["arm"]]
        tally["n"] += 1
        if (row.get("graded") or {}).get("correct"):
            tally["correct"] += 1
        kind = classify(row.get("error"), row.get("raw_text") or "")
        if kind != "ok":
            tally[kind] += 1

    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
    print("-" * 88)
    for arm in sorted(by_arm):
        tally = by_arm[arm]
        raw = tally["correct"] / tally["n"] * 100
        transient = tally["transient_ssl_or_5xx"] + tally["transient_empty"]
        intrinsic = tally["intrinsic_limit"]
        other = tally["other_error"]
        usable = tally["n"] - transient
        # Guard against an arm where every row failed transiently.
        adj = tally["correct"] / usable * 100 if usable else 0
        print(
            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
        )

    print()
    for footnote in (
        "transient = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would",
        " succeed on retry; eval harness has no built-in retry today).",
        "intrinsic = hard limit (e.g. >30MB Anthropic request, model context overflow).",
        "adj acc% = correct / (n - transient) — what the arm scores when network noise",
        " is removed; closest thing we have to a like-for-like quality number.",
    ):
        print(footnote)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,381 @@
"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
per-PDF heterogeneity, latency/token distribution percentiles.
Reads the merged post-retry artifact:
data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
Outputs to stdout:
1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
2) Per-arm input/output token distribution (mean, p50, p95, max).
3) McNemar pairwise table: for every unordered pair (arm_i, arm_j) on the
same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
two-sided p-value. We include both raw (using the original raw.jsonl)
and post-retry results.
4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
Pure stdlib — no scipy/numpy.
"""
from __future__ import annotations
import argparse
import json
import math
import statistics
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
# ---------------------------------------------------------------------------
# I/O
# ---------------------------------------------------------------------------
def _read_jsonl(path: Path) -> list[dict]:
out: list[dict] = []
with path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
out.append(json.loads(line))
return out
# ---------------------------------------------------------------------------
# Distribution helpers
# ---------------------------------------------------------------------------
def _percentile(values: list[float], p: float) -> float:
"""Linear-interpolation percentile (p in [0, 100])."""
if not values:
return 0.0
s = sorted(values)
if len(s) == 1:
return float(s[0])
k = (len(s) - 1) * (p / 100.0)
lo, hi = math.floor(k), math.ceil(k)
if lo == hi:
return float(s[int(k)])
return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
# ---------------------------------------------------------------------------
# McNemar exact-binomial p-value
# ---------------------------------------------------------------------------
def _binom_coef(n: int, k: int) -> int:
if k < 0 or k > n:
return 0
return math.comb(n, k)
def _mcnemar_exact_pvalue(b: int, c: int) -> float:
"""Two-sided exact-binomial McNemar p-value.
Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
The two-sided p-value is
P(X <= min(b, c)) + P(X >= max(b, c))
computed exactly (cheap because b+c <= 27 in our run).
"""
n = b + c
if n == 0:
return 1.0
k = min(b, c)
# Two-sided exact: 2 * P(X <= k) clipped at 1.0
cdf = sum(_binom_coef(n, i) for i in range(k + 1))
p = 2.0 * cdf / (2 ** n)
return min(1.0, p)
def _mcnemar_table(rows: list[dict]) -> dict:
    """Build pairwise McNemar contingency counts and p-values across arms.

    Each row must carry ``"qid"`` and ``"arm"``; correctness is read from
    ``row["graded"]["correct"]`` and treated as False when ``graded`` is
    missing or falsy. If the same (qid, arm) appears more than once, the
    last row wins.

    Returns:
        ``{"arms": sorted arm names, "n_qids": number of distinct qids,
        "pairs": [...]}`` with one entry per unordered arm pair. A qid
        contributes to a pair only when both arms have a row for it.
    """
    verdicts: dict[str, dict[str, bool]] = {}
    for rec in rows:
        qid = rec["qid"]
        arm = rec["arm"]
        grade = rec.get("graded") or {}
        verdicts.setdefault(qid, {})[arm] = bool(grade.get("correct"))

    arms = sorted({arm for per_arm in verdicts.values() for arm in per_arm})
    qids = sorted(verdicts)
    table: dict[str, dict] = {"arms": arms, "n_qids": len(qids), "pairs": []}

    for pos, first in enumerate(arms):
        for second in arms[pos + 1:]:
            # Keyed by (first arm correct, second arm correct).
            tally = {
                (True, False): 0,
                (False, True): 0,
                (True, True): 0,
                (False, False): 0,
            }
            for qid in qids:
                per_arm = verdicts[qid]
                if first in per_arm and second in per_arm:
                    tally[(per_arm[first], per_arm[second])] += 1
            first_only = tally[(True, False)]
            second_only = tally[(False, True)]
            table["pairs"].append({
                "arm_i": first, "arm_j": second,
                "b_i_only": first_only, "c_j_only": second_only,
                "both_correct": tally[(True, True)],
                "both_wrong": tally[(False, False)],
                "p_value": _mcnemar_exact_pvalue(first_only, second_only),
            })
    return table
# ---------------------------------------------------------------------------
# Per-PDF heterogeneity
# ---------------------------------------------------------------------------
def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
    """Summarise, per arm, how accuracy varies from one PDF to the next.

    Per-PDF accuracy is the fraction of that PDF's rows whose
    ``graded["correct"]`` is truthy (a missing ``graded`` counts as wrong).

    Returns:
        ``{arm: {...}}`` holding the number of PDFs, mean, sample standard
        deviation (0.0 when there is only one PDF), min/max, the 25th,
        50th and 75th percentiles, and how many PDFs scored exactly 0 or
        exactly 1.
    """
    grouped: dict[str, dict[str, list[bool]]] = {}
    for rec in rows:
        arm = rec["arm"]
        pdf = rec["doc_id"]
        grade = rec.get("graded") or {}
        grouped.setdefault(arm, {}).setdefault(pdf, []).append(
            bool(grade.get("correct"))
        )

    summary: dict[str, dict] = {}
    for arm, per_pdf in grouped.items():
        accs = [sum(flags) / len(flags) for flags in per_pdf.values() if flags]
        if accs:
            summary[arm] = {
                "n_pdfs": len(accs),
                "mean": statistics.mean(accs),
                "std": statistics.stdev(accs) if len(accs) > 1 else 0.0,
                "min": min(accs),
                "max": max(accs),
                "p25": _percentile(accs, 25),
                "p50": _percentile(accs, 50),
                "p75": _percentile(accs, 75),
                "n_pdfs_zero": accs.count(0.0),
                "n_pdfs_perfect": accs.count(1.0),
            }
    return summary
# ---------------------------------------------------------------------------
# Latency / token distributions
# ---------------------------------------------------------------------------
def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
    """Compute the per-arm latency distribution, reported in seconds.

    Rows whose ``latency_ms`` is missing, ``None`` or ``0`` are excluded.

    Returns:
        ``{arm: {...}}`` with the sample count, mean, sample standard
        deviation, selected percentiles (p10..p99), max, and the
        coefficient of variation ``std / mean`` (unitless; 0.0 when it
        cannot be computed).
    """
    samples: dict[str, list[float]] = {}
    for rec in rows:
        lat = rec.get("latency_ms")
        if not (lat is None or lat == 0):
            samples.setdefault(rec["arm"], []).append(float(lat))

    report: dict[str, dict] = {}
    for arm, lats in samples.items():
        avg = statistics.mean(lats)
        # stdev needs at least two samples; None marks "not computable".
        spread = statistics.stdev(lats) if len(lats) > 1 else None
        stats: dict = {
            "n": len(lats),
            "mean_s": avg / 1000,
            "std_s": 0.0 if spread is None else spread / 1000,
        }
        for pct in (10, 25, 50, 75, 90, 95, 99):
            stats[f"p{pct}_s"] = _percentile(lats, pct) / 1000
        stats["max_s"] = max(lats) / 1000
        stats["cv"] = spread / avg if spread is not None and avg > 0 else 0.0
        report[arm] = stats
    return report
def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
    """Compute per-arm input/output token distributions.

    Token counts that are missing, ``None`` or ``0`` are ignored.

    Returns:
        ``{arm: {"input": {...}, "output": {...}}}`` in sorted arm order.
        Each sub-dict holds ``n``, ``mean``, ``p50``, ``p95`` and ``max``;
        a sub-dict is omitted when the arm has no samples of that kind.
    """
    def _describe(vals: list[float]) -> dict:
        return {
            "n": len(vals),
            "mean": statistics.mean(vals),
            "p50": _percentile(vals, 50),
            "p95": _percentile(vals, 95),
            "max": max(vals),
        }

    inputs: dict[str, list[float]] = {}
    outputs: dict[str, list[float]] = {}
    for rec in rows:
        for field, sink in (("input_tokens", inputs), ("output_tokens", outputs)):
            count = rec.get(field) or 0
            if count:
                sink.setdefault(rec["arm"], []).append(float(count))

    report: dict[str, dict] = {}
    for arm in sorted(set(inputs) | set(outputs)):
        in_vals = inputs.get(arm, [])
        out_vals = outputs.get(arm, [])
        if not (in_vals or out_vals):
            continue
        entry: dict = {}
        if in_vals:
            entry["input"] = _describe(in_vals)
        if out_vals:
            entry["output"] = _describe(out_vals)
        report[arm] = entry
    return report
# ---------------------------------------------------------------------------
# Pretty-printing
# ---------------------------------------------------------------------------
def _print_latency(title: str, lat: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
print(header)
print("-" * len(header))
for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
s = lat[arm]
print(f"{arm:<25} {s['n']:>4} "
f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
def _print_tokens(title: str, toks: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}"
f" {'out mean':>9} {'out p95':>9}")
print(header)
print("-" * len(header))
for arm in sorted(toks):
e = toks[arm]
ein = e.get("input")
eout = e.get("output")
if not ein:
continue
print(f"{arm:<25} "
f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f} "
f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}")
def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} "
f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}")
print(header)
print("-" * len(header))
for arm in sorted(var, key=lambda a: -var[a]["mean"]):
s = var[arm]
print(f"{arm:<25} {s['n_pdfs']:>7} "
f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% "
f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% "
f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}")
def _print_mcnemar(title: str, table: dict) -> None:
print()
print(title)
print("-" * len(title))
print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
print(header)
print("-" * len(header))
for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
sig = ""
if pair["p_value"] < 0.001:
sig = "***"
elif pair["p_value"] < 0.01:
sig = "**"
elif pair["p_value"] < 0.05:
sig = "*"
print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
f"{pair['p_value']:>13.4f} {sig:>4}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: print every statistics table for one run.

    Reads ``raw.jsonl`` and ``raw_post_retry.jsonl`` from the run's
    ``parser_compare`` directory and exits with a message if either is
    missing. Returns 0 on success.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    opts = cli.parse_args()

    compare_dir = REPO / "data" / "multimodal_doc" / "runs" / opts.run_id / "parser_compare"
    raw_path = compare_dir / "raw.jsonl"
    post_path = compare_dir / "raw_post_retry.jsonl"
    if not (raw_path.exists() and post_path.exists()):
        raise SystemExit(
            "Missing raw.jsonl or raw_post_retry.jsonl. "
            "Run scripts/compute_post_retry_accuracy.py first."
        )

    raw_rows = _read_jsonl(raw_path)
    post_rows = _read_jsonl(post_path)
    print(f"Run: {opts.run_id}")
    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")

    # Latency comes from the post-retry rows because they carry the retry's
    # own latency for recovered rows; in the raw file those rows have
    # latency=0 since the harness recorded a failure.
    latency = _per_arm_latency(post_rows)
    _print_latency("Per-arm latency (post-retry)", latency)

    tokens = _per_arm_tokens(post_rows)
    _print_tokens("Per-arm token distribution (post-retry)", tokens)

    pdf_spread = _per_pdf_stats(post_rows)
    _print_pdf_var("Per-PDF accuracy heterogeneity (post-retry)", pdf_spread)

    # McNemar is reported for both the raw and the merged rows.
    _print_mcnemar("McNemar pairwise (RAW, no retries)", _mcnemar_table(raw_rows))
    _print_mcnemar("McNemar pairwise (POST-RETRY)", _mcnemar_table(post_rows))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,180 @@
"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
Reads:
- data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
- data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
For each (arm, qid) present in the retry artifact:
- if the retry RECOVERED, the retry row replaces the original row (same
grader is reused — see ``mmlongbench/grader.py``);
- if the retry did NOT recover, the original row stays (still a failure,
so ``correct=False`` and ``f1=0``).
Prints two tables side by side:
* Raw run (no retries) — matches §1 of the blog.
* Post-retry run — final, "what would the headline have been if
the harness had had retries from day one".
It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
so any downstream notebook / report can join straight on it.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
def _read_jsonl(path: Path) -> list[dict]:
out: list[dict] = []
with path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
out.append(json.loads(line))
return out
def _row_key(row: dict) -> tuple[str, str]:
return (str(row["arm"]), str(row["qid"]))
def _is_failure(row: dict) -> bool:
if row.get("error"):
return True
if not (row.get("raw_text") or "").strip():
return True
return False
def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
    """Aggregate accuracy, mean F1 and failure counts for every arm.

    Args:
        rows_by_arm: Mapping of arm name to that arm's result rows.

    Returns:
        ``{arm: {"n", "n_correct", "n_failures", "accuracy", "f1_mean",
        "failure_rate"}}``. All ratios are 0.0 for an arm with no rows.

    A row whose ``"graded"`` value is missing *or* ``None`` counts as
    incorrect with F1 0. ``row.get("graded", {})`` alone would raise
    ``AttributeError`` on an explicit null, because the default only
    applies when the key is absent.
    """
    out: dict[str, dict] = {}
    for arm, rows in rows_by_arm.items():
        n = len(rows)
        grades = [(r.get("graded") or {}) for r in rows]
        n_correct = sum(1 for g in grades if g.get("correct"))
        f1_sum = sum(float(g.get("f1") or 0.0) for g in grades)
        n_fail = sum(1 for r in rows if _is_failure(r))
        out[arm] = {
            "n": n,
            "n_correct": n_correct,
            "n_failures": n_fail,
            "accuracy": (n_correct / n) if n else 0.0,
            "f1_mean": (f1_sum / n) if n else 0.0,
            "failure_rate": (n_fail / n) if n else 0.0,
        }
    return out
def _print_table(title: str, summary: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = f"{'arm':<25} {'n':>4} {'n_corr':>7} {'acc':>7} {'F1':>7} {'fails':>6} {'fail%':>7}"
print(header)
print("-" * len(header))
# stable order: highest accuracy first
arms_sorted = sorted(summary.items(), key=lambda kv: -kv[1]["accuracy"])
for arm, s in arms_sorted:
print(f"{arm:<25} {s['n']:>4} {s['n_correct']:>7} "
f"{s['accuracy']*100:>6.1f}% {s['f1_mean']*100:>6.1f}% "
f"{s['n_failures']:>6} {s['failure_rate']*100:>6.1f}%")
def main() -> int:
    """CLI entry point: merge retry survivors into the raw rows and report.

    Writes ``raw_post_retry.jsonl`` next to ``raw.jsonl``, then prints the
    raw summary, the post-retry summary and a per-arm delta.

    Returns:
        0 on success, 1 when ``raw.jsonl`` or ``raw_retries.jsonl`` is
        missing (a message is written to stderr).
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    opts = cli.parse_args()

    compare_dir = REPO / "data" / "multimodal_doc" / "runs" / opts.run_id / "parser_compare"
    raw_path = compare_dir / "raw.jsonl"
    retry_path = compare_dir / "raw_retries.jsonl"
    out_path = compare_dir / "raw_post_retry.jsonl"

    if not raw_path.exists():
        print(f"raw.jsonl not found at {raw_path}", file=sys.stderr)
        return 1
    if not retry_path.exists():
        print(f"raw_retries.jsonl not found at {retry_path}", file=sys.stderr)
        return 1

    raw_rows = _read_jsonl(raw_path)
    retry_rows = _read_jsonl(retry_path)

    # Index retries by (arm, qid); a later duplicate overrides an earlier one.
    retries: dict[tuple[str, str], dict] = {}
    for candidate in retry_rows:
        retries[_row_key(candidate)] = candidate

    merged_rows: list[dict] = []
    tally = {"recovered": 0, "still_failed": 0, "untouched": 0}
    for original in raw_rows:
        retried = retries.get(_row_key(original))
        if retried is None:
            outcome, chosen = "untouched", original
        elif bool(retried.get("retry", {}).get("recovered")):
            # The retry row has the same shape plus a "retry" sub-object;
            # it replaces the original only when it is a recovery.
            outcome, chosen = "recovered", retried
        else:
            # Still failing: keep the original row. It is the one the
            # headline numbers came from and the verdict is the same.
            outcome, chosen = "still_failed", original
        merged_rows.append(chosen)
        tally[outcome] += 1

    # Persist the merged rows for downstream consumers.
    with out_path.open("w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(rec) + "\n" for rec in merged_rows)

    def _group_by_arm(rows: list[dict]) -> dict[str, list[dict]]:
        grouped: dict[str, list[dict]] = {}
        for rec in rows:
            grouped.setdefault(rec["arm"], []).append(rec)
        return grouped

    raw_summary = _summarise(_group_by_arm(raw_rows))
    post_summary = _summarise(_group_by_arm(merged_rows))

    print()
    print(f"Run: {opts.run_id}")
    print(f"Replaced (retry recovered): {tally['recovered']}")
    print(f"Kept original (retry still failed): {tally['still_failed']}")
    print(f"Untouched rows: {tally['untouched']}")
    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")

    _print_table("Raw run (no retries)", raw_summary)
    _print_table("Post-retry run (final)", post_summary)

    print()
    print("Delta (post-retry minus raw):")
    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
    print("-" * 42)
    for arm in sorted(set(raw_summary) | set(post_summary)):
        before = raw_summary.get(arm)
        after = post_summary.get(arm)
        if before and after:
            d_acc = (after["accuracy"] - before["accuracy"]) * 100
            d_fail = after["n_failures"] - before["n_failures"]
            print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,59 @@
"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
Run from surfsense_evals/ root via:
python scripts/inspect_first30.py
Prints which docs are already ingested (existing 5), which are new (25 to
upload), how many questions cover those 30 PDFs, and the answerable /
unanswerable + format mix.
"""
from __future__ import annotations
import json
from collections import Counter
from pathlib import Path
def main() -> None:
    """Print a scoping summary for the first 30 MMLongBench-Doc PDFs.

    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to
    the current working directory, takes the first 30 ``doc_id`` values in
    sorted order, and reports how many are new versus already ingested,
    how many questions cover them, and the answer-format mix.

    Per-PDF averages divide by the number of documents actually in scope,
    so they stay correct when the dataset holds fewer than 30 PDFs; with
    exactly 30 the output matches the previous hard-coded divisor.
    """
    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
    lines = qpath.read_text(encoding="utf-8").splitlines()
    rows = [json.loads(line) for line in lines if line.strip()]

    docs_by_id = sorted({r["doc_id"] for r in rows})
    first30 = docs_by_id[:30]
    # Built once; the membership test below runs for every question row.
    in_scope = set(first30)
    n_docs = len(first30)
    existing5 = {
        "05-03-18-political-release.pdf",
        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
        "0e94b4197b10096b1f4c699701570fbf.pdf",
        "11-21-16-Updated-Post-Election-Release.pdf",
        "12-15-15-ISIS-and-terrorism-release-final.pdf",
    }
    new25 = [d for d in first30 if d not in existing5]
    print(
        f"first 30 docs (alphabetical) — {len(new25)} new, "
        f"{len(first30) - len(new25)} already in SurfSense"
    )

    qs_in_30 = [r for r in rows if r["doc_id"] in in_scope]
    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
    # An answer_format of "none" marks an unanswerable question.
    answerable = sum(v for k, v in fmts.items() if k != "none")
    unanswerable = fmts.get("none", 0)
    print(
        f"questions covering first 30 docs: total={len(qs_in_30)} "
        f"answerable={answerable} unanswerable={unanswerable}"
    )
    avg_qs = len(qs_in_30) / n_docs if n_docs else 0.0
    avg_answerable = answerable / n_docs if n_docs else 0.0
    print(
        f"avg Qs/PDF: {avg_qs:.1f} "
        f"answerable/PDF: {avg_answerable:.1f}"
    )
    print(f"format mix in scope: {dict(fmts)}")
    print()
    print(f"{len(new25)} new PDFs to ingest:")
    for d in new25:
        print(f" - {d}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,100 @@
"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
The mmlongbench Surfsense ingest writes its manifest only at the very
end of the upload pipeline (~hours of celery work). parser_compare's
ingest, on the other hand, just needs a list of (doc_id, pdf_path)
tuples to know which PDFs to extract — it doesn't care about the
SurfSense ``document_id`` (the runner does, later, after a refresh).
This script extends the existing manifest with the *additional* PDFs
that mmlongbench has already cached on disk (i.e. all 30 PDFs in
``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
SurfSense ``document_id``s yet) so parser_compare can run all four
extractions for them in parallel with the SurfSense ingest.
After mmlongbench finishes, re-run::
python -m surfsense_evals ingest multimodal_doc parser_compare \
--max-docs 30
to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
``document_id`` values for the 25 new PDFs. The extractions
themselves are cached on disk so the second pass is essentially free.
"""
from __future__ import annotations
import json
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
def _question_count_per_doc() -> dict[str, int]:
    """Count how many question rows reference each ``doc_id`` in ``QUESTIONS``.

    Blank lines in the JSONL file are ignored. Every remaining row must
    carry a ``"doc_id"`` key.
    """
    tally: dict[str, int] = {}
    with QUESTIONS.open("r", encoding="utf-8") as handle:
        for text in (raw.strip() for raw in handle):
            if text:
                doc_id = json.loads(text)["doc_id"]
                tally[doc_id] = tally.get(doc_id, 0) + 1
    return tally
def main() -> None:
    """Extend the mmlongbench manifest with stub rows for cached PDFs.

    Every ``*.pdf`` in ``PDF_DIR`` that is not yet listed in ``MAP_PATH``
    gets a row with ``document_id`` set to ``None``. Existing rows and the
    ``__settings__`` header line are preserved, and the manifest is
    rewritten in place with rows sorted by ``doc_id``.

    Raises:
        SystemExit: If the manifest does not exist yet.
    """
    if not MAP_PATH.exists():
        # The two fragments used to be joined with no separator, which
        # glued the path straight onto the word "run".
        raise SystemExit(
            f"manifest not found at {MAP_PATH}; "
            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
        )

    existing_lines = MAP_PATH.read_text(encoding="utf-8").splitlines()
    existing_rows: list[dict] = []
    settings_line = None
    for line in existing_lines:
        line = line.strip()
        if not line:
            continue
        row = json.loads(line)
        if "__settings__" in row:
            # Keep the header verbatim so it round-trips unchanged.
            settings_line = line
        else:
            existing_rows.append(row)

    by_doc_id = {r["doc_id"]: r for r in existing_rows}
    counts = _question_count_per_doc()

    cached_pdfs = sorted(PDF_DIR.glob("*.pdf"))
    print(f"existing manifest entries: {len(existing_rows)}")
    print(f"cached PDFs on disk: {len(cached_pdfs)}")

    added = 0
    for pdf in cached_pdfs:
        if pdf.name in by_doc_id:
            continue
        by_doc_id[pdf.name] = {
            "doc_id": pdf.name,
            "document_id": None,
            "pdf_path": str(pdf),
            "n_questions": counts.get(pdf.name, 0),
        }
        added += 1

    out_lines: list[str] = []
    if settings_line:
        out_lines.append(settings_line)
    out_lines.extend(json.dumps(by_doc_id[doc_id]) for doc_id in sorted(by_doc_id))
    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
    print(f"wrote: {MAP_PATH}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,37 @@
"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
from __future__ import annotations
import glob
import json
from collections import defaultdict
def main() -> None:
    """Dump the per-question outputs of the latest CRAG run.

    Picks the lexicographically last ``data/research/runs/*/crag/raw.jsonl``
    (relative to the current working directory) and, for every question,
    prints the gold answer plus each arm's grade and the last 200
    characters of its raw text.

    Raises:
        SystemExit: If no CRAG ``raw.jsonl`` is found.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        # Indexing [-1] on an empty list would surface as a bare IndexError.
        raise SystemExit("no CRAG run found under data/research/runs/*/crag/raw.jsonl")
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Context manager so the handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        by_q[r["qid"]][r["arm"]] = r

    arm_names = ("bare_llm", "long_context", "surfsense")
    for qid, arms in by_q.items():
        # Question metadata is read from the bare_llm row.
        b = arms.get("bare_llm", {})
        print(f"\n=== {qid} ({b.get('domain')}/{b.get('question_type')}) ===")
        # `or {}` also covers an explicit null, which .get(key, {}) does not.
        print(f" question: {(b.get('extra') or {}).get('question', '?')!r}")
        print(f" gold: {b.get('gold')!r}")
        for arm_name in arm_names:
            a = arms.get(arm_name, {})
            grade = a.get("graded") or {}
            text = (a.get("raw_text") or "").strip()
            tail = text[-200:] if text else ""
            print(
                f" [{arm_name}] grade={grade.get('grade')} "
                f"method={grade.get('method')}"
            )
            print(f" -> {tail!r}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,64 @@
"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
from __future__ import annotations
import glob
import json
from collections import defaultdict
def _print_disagreements(heading: str, qids: list, by_q: dict) -> None:
    """Print ``heading`` followed by per-arm detail for up to five ``qids``."""
    print(heading)
    for qid in qids[:5]:
        arms = by_q[qid]
        # Question metadata is read from the bare_llm row.
        b = arms.get("bare_llm", {})
        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
        print(f" GOLD: {b.get('gold')!r}")
        for arm_name in ("bare_llm", "long_context", "surfsense"):
            a = arms.get(arm_name, {})
            t = (a.get("raw_text") or "").strip()
            tail = t[-180:] if t else ""
            grade = a.get("graded") or {}
            print(f" [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")


def main() -> None:
    """Show where SurfSense and long-context disagree in the latest CRAG run.

    Picks the lexicographically last ``data/research/runs/*/crag/raw.jsonl``
    (relative to the current working directory), counts the questions on
    which one arm is graded ``"incorrect"`` while another is ``"correct"``,
    and prints up to five examples for each SurfSense/long-context
    direction.

    Raises:
        SystemExit: If no CRAG ``raw.jsonl`` is found.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        # Indexing [-1] on an empty list would surface as a bare IndexError.
        raise SystemExit("no CRAG run found under data/research/runs/*/crag/raw.jsonl")
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Context manager so the handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        by_q[r["qid"]][r["arm"]] = r

    def _grade_of(arms: dict, arm_name: str):
        # `or {}` also covers an explicit null "graded" value.
        return (arms.get(arm_name, {}).get("graded") or {}).get("grade")

    surf_wrong_lc_right = []
    lc_wrong_surf_right = []
    surf_wrong_bare_right = []
    for qid, arms in by_q.items():
        b = _grade_of(arms, "bare_llm")
        lc = _grade_of(arms, "long_context")
        s = _grade_of(arms, "surfsense")
        if s == "incorrect" and lc == "correct":
            surf_wrong_lc_right.append(qid)
        if lc == "incorrect" and s == "correct":
            lc_wrong_surf_right.append(qid)
        if s == "incorrect" and b == "correct":
            surf_wrong_bare_right.append(qid)

    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")

    _print_disagreements(
        "\n=== Where SurfSense is wrong but long-context is right (top 5) ===",
        surf_wrong_lc_right, by_q,
    )
    _print_disagreements(
        "\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===",
        lc_wrong_surf_right, by_q,
    )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,636 @@
"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
The original parser_compare run records one row per (arm, qid) in
``raw.jsonl``. Some of those rows came back with transient transport
errors (SSL alerts, gateway 502s, empty SSE streams) or empty
``raw_text``. This script re-issues *only* those calls with exponential
backoff so we can see how many recover.
Design constraints / choices:
* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+ the original ``mmlongbench/questions.jsonl``.
* **No SurfSense backend or celery required.** SurfSense had 0
reported failures; this script will skip any ``surfsense_agentic``
rows it encounters and warn rather than try to start the backend.
* **Original ``raw.jsonl`` is never mutated.** Retries land in a
sibling ``raw_retries.jsonl`` so the original artifact stays
citeable.
* **Idempotent.** Re-running this script re-tries the same set of
failed rows from ``raw.jsonl``. If you want to merge survivor rows
back in, do that as a separate aggregation step.
Usage:
python scripts/retry_failed_questions.py \
--run-id 2026-05-14T00-53-19Z \
--max-attempts 5 \
--concurrency 2
Outputs (written next to the original raw.jsonl):
* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
carries the original error, every retry attempt's timing/error,
and the final result (incl. grade) so you can drop it straight
into a notebook.
* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
counts and an aggregated retry-success rate.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import random
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
REPO = Path(__file__).resolve().parents[1]
SRC = REPO / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
from dotenv import load_dotenv # noqa: E402
from surfsense_evals.core.arms import ( # noqa: E402
ArmRequest,
ArmResult,
BareLlmArm,
NativePdfArm,
)
from surfsense_evals.core.parse.freeform_answer import ( # noqa: E402
extract_freeform_answer,
)
from surfsense_evals.core.providers.openrouter_chat import ( # noqa: E402
OpenRouterChatProvider,
)
from surfsense_evals.core.providers.openrouter_pdf import ( # noqa: E402
OpenRouterPdfProvider,
PdfEngine,
)
from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade # noqa: E402
from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import ( # noqa: E402
build_long_context_prompt,
build_native_pdf_prompt,
)
logger = logging.getLogger("retry_failed_questions")
LC_ARMS = {
"azure_basic_lc",
"azure_premium_lc",
"llamacloud_basic_lc",
"llamacloud_premium_lc",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _is_failure_row(row: dict[str, Any]) -> bool:
"""A row counts as failed if it raised an error OR returned empty text.
We retry both because the empty-stream case is the same operational
failure mode (the call returned nothing usable) we just didn't
raise it as an exception.
"""
if row.get("error"):
return True
if not (row.get("raw_text") or "").strip():
return True
return False
@dataclass
class FailedRow:
    """One failed (arm, question) result, as read back from ``raw.jsonl``."""

    # Identity of the failed call.
    arm: str
    qid: str
    doc_id: str
    # Values copied from the row; absent values become "" or 0.
    answer_format: str
    gold: str
    pages: int
    # Copied from the row as-is; may be None.
    document_id: int | None
    # The row's "error" value, if any (empty-text failures have none).
    original_error: str | None
    # The complete decoded JSON row.
    original_row: dict[str, Any]
def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
    """Collect every failed row of ``raw_path`` as a ``FailedRow``.

    Blank lines are skipped. A row is kept only when ``_is_failure_row``
    accepts it; ``arm``, ``qid`` and ``doc_id`` are required keys, the
    remaining fields fall back to ``""``, ``0`` or ``None``.
    """
    failures: list[FailedRow] = []
    with raw_path.open("r", encoding="utf-8") as handle:
        for text in (raw.strip() for raw in handle):
            if not text:
                continue
            record = json.loads(text)
            if _is_failure_row(record):
                failures.append(FailedRow(
                    arm=str(record["arm"]),
                    qid=str(record["qid"]),
                    doc_id=str(record["doc_id"]),
                    answer_format=str(record.get("answer_format") or ""),
                    gold=str(record.get("gold") or ""),
                    pages=int(record.get("pages") or 0),
                    document_id=record.get("document_id"),
                    original_error=record.get("error"),
                    original_row=record,
                ))
    return failures
def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
out: dict[str, dict[str, Any]] = {}
with map_path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
row = json.loads(line)
out[str(row["doc_id"])] = row
return out
def _load_question_text_index(
questions_jsonl: Path,
) -> dict[tuple[str, int], dict[str, Any]]:
"""Map (doc_id, per_doc_index) -> raw question row.
qids in raw.jsonl are formatted ``{doc_id}::Q{NNN}`` where NNN is
the per-doc index. Reproducing the runner's question selection
requires walking ``questions.jsonl`` in order and assigning
indices per doc_id (so we match the runner's ``per_doc_idx`` logic
in ``_select_questions``).
"""
out: dict[tuple[str, int], dict[str, Any]] = {}
per_doc_idx: dict[str, int] = {}
with questions_jsonl.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
row = json.loads(line)
doc_id = str(row.get("doc_id") or "")
if not doc_id:
continue
idx = per_doc_idx.get(doc_id, 0)
per_doc_idx[doc_id] = idx + 1
out[(doc_id, idx)] = row
return out
def _qid_index(qid: str) -> int:
"""Parse the per-doc question index out of a qid like ``foo.pdf::Q007``."""
_, _, q_part = qid.rpartition("::")
if not q_part.startswith("Q"):
raise ValueError(f"unexpected qid shape: {qid!r}")
return int(q_part[1:])
# ---------------------------------------------------------------------------
# Request building (mirrors runner.py exactly so prompts are byte-identical)
# ---------------------------------------------------------------------------
def _build_native_request(
    qid: str, question: str, answer_format: str, pdf_path: Path,
    *, max_output_tokens: int,
) -> ArmRequest:
    """Build the request for a native-PDF arm.

    The prompt comes from ``build_native_pdf_prompt``, the PDF is attached
    by path, and ``max_output_tokens`` is passed as the ``"max_tokens"``
    option.
    """
    prompt = build_native_pdf_prompt(question, answer_format=answer_format)
    options = {"max_tokens": max_output_tokens}
    return ArmRequest(
        question_id=qid,
        prompt=prompt,
        pdf_paths=[pdf_path],
        options=options,
    )
def _build_lc_request(
    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
) -> ArmRequest:
    """Build the request for a long-context arm from cached parser markdown.

    The markdown at ``md_path`` is read as UTF-8 and embedded in the prompt
    by ``build_long_context_prompt``, labelled with ``doc_id``.

    Raises:
        FileNotFoundError: If ``md_path`` does not exist.
    """
    if not md_path.exists():
        raise FileNotFoundError(
            f"Missing parser extraction at {md_path}; cannot retry LC arm."
        )
    prompt = build_long_context_prompt(
        question,
        answer_format=answer_format,
        document_markdown=md_path.read_text(encoding="utf-8"),
        document_label=doc_id,
    )
    return ArmRequest(question_id=qid, prompt=prompt)
# ---------------------------------------------------------------------------
# Retry driver
# ---------------------------------------------------------------------------
@dataclass
class AttemptLog:
    """Timing and outcome of a single retry attempt."""

    # 1-based attempt number.
    attempt: int
    # UTC start time, formatted as %Y-%m-%dT%H:%M:%SZ.
    started_iso: str
    # Wall time of the call in whole milliseconds.
    latency_ms: int
    # Error text for this attempt; None when the attempt succeeded.
    error: str | None
    # Length of the stripped response text.
    raw_text_chars: int
@dataclass
class RetryOutcome:
    """Result of retrying one (arm, question) pair."""

    arm: str
    qid: str
    # One entry per attempt, in the order they were made.
    attempts: list[AttemptLog]
    # Result of the last attempt that ran, successful or not.
    final_result: ArmResult
    # True when an attempt returned non-empty text with no error.
    recovered: bool
async def _retry_one(
    arm_obj: Any, request: ArmRequest, *,
    arm_name: str,
    qid: str,
    max_attempts: int,
    base_delay: float,
    max_delay: float,
) -> RetryOutcome:
    """Call ``arm_obj.answer(request)`` until it succeeds or attempts run out.

    An attempt succeeds when the result has no ``error`` and its
    ``raw_text`` is non-empty after stripping. An error-free result with
    empty text is logged as an ``EmptyResponse`` failure.

    Args:
        arm_obj: Object exposing an awaitable ``answer(request)``.
        request: Request passed unchanged to every attempt.
        arm_name: Arm label, used for logging and in the outcome.
        qid: Question id, used for logging and in the outcome.
        max_attempts: Maximum number of calls; must be at least 1.
        base_delay: Back-off in seconds after the first failure. It
            doubles after each further failure.
        max_delay: Cap on the back-off before jitter is applied.

    Returns:
        A ``RetryOutcome`` with one ``AttemptLog`` per call, the last
        ``ArmResult`` obtained, and ``recovered=True`` only on success.

    Raises:
        ValueError: If ``max_attempts`` is less than 1. This used to be
            caught only by an ``assert``, which is stripped under
            ``python -O`` and would then let a ``None`` result through.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts!r}")

    attempts: list[AttemptLog] = []
    final: ArmResult | None = None
    for attempt in range(1, max_attempts + 1):
        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        t0 = time.monotonic()
        # NOTE(review): assumes answer() reports failures via result.error
        # rather than raising; an exception here propagates un-retried.
        result = await arm_obj.answer(request)
        latency_ms = int((time.monotonic() - t0) * 1000)
        raw_text = (result.raw_text or "").strip()
        attempt_error = result.error
        if not attempt_error and not raw_text:
            attempt_error = "EmptyResponse: stream ended with no text"
        attempts.append(AttemptLog(
            attempt=attempt,
            started_iso=started_iso,
            latency_ms=latency_ms,
            error=attempt_error,
            raw_text_chars=len(raw_text),
        ))
        final = result
        if not attempt_error and raw_text:
            return RetryOutcome(
                arm=arm_name, qid=qid, attempts=attempts,
                final_result=result, recovered=True,
            )
        if attempt < max_attempts:
            # Exponential back-off capped at max_delay, then scaled by a
            # random factor in [0.5, 1.5). The jittered delay can therefore
            # exceed max_delay.
            delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
            delay = delay * (0.5 + random.random())
            logger.info(
                "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
                arm_name, qid, attempt, max_attempts, attempt_error, delay,
            )
            await asyncio.sleep(delay)
    # The guard above ensures the loop ran; the assert only narrows the type.
    assert final is not None
    return RetryOutcome(
        arm=arm_name, qid=qid, attempts=attempts,
        final_result=final, recovered=False,
    )
async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
sem = asyncio.Semaphore(max(1, concurrency))
async def _wrap(coro):
async with sem:
return await coro
return await asyncio.gather(*(_wrap(c) for c in coros))
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def _run(args: argparse.Namespace) -> int:
    """Retry the failed rows of one parser_compare run and persist the outcome.

    Steps, in order:
      1. Load ``.env`` and configure logging.
      2. Locate ``raw.jsonl`` for ``args.run_id`` plus the doc manifest and
         the questions file; abort with ``SystemExit`` if any is missing.
      3. Load the failed rows, dropping ``surfsense_agentic`` ones unless
         ``args.include_surfsense`` is set.
      4. Build one request per remaining row (``native_pdf`` or an arm in
         ``LC_ARMS``); rows that cannot be rebuilt are logged and skipped.
      5. Run the retries with bounded concurrency.
      6. Write ``raw_retries.jsonl`` and ``raw_retries_summary.json`` into
         the run directory and print a per-arm table.

    Args:
        args: Parsed CLI namespace. Reads ``run_id``, ``include_surfsense``,
            ``llm_model``, ``pdf_engine``, ``max_output_tokens``,
            ``max_attempts``, ``base_delay``, ``max_delay`` and
            ``concurrency``.

    Returns:
        ``0`` on every path that does not abort, including "nothing to retry".

    Raises:
        SystemExit: If a required input file or ``OPENROUTER_API_KEY`` is
            missing.
    """
    load_dotenv(REPO / ".env")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    if not raw_path.exists():
        raise SystemExit(f"raw.jsonl not found at {raw_path}")
    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
    if not map_path.exists():
        raise SystemExit(f"parser_compare manifest not found at {map_path}")
    if not questions_jsonl.exists():
        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")

    failed = _load_failed_rows(raw_path)
    if not failed:
        logger.info("No failed rows in %s — nothing to retry.", raw_path)
        return 0

    # SurfSense rows are dropped by default: we don't want to start the
    # backend just to defensively retry a 0-failure arm. The "skipping"
    # warning is only emitted when rows are actually dropped, so it no longer
    # contradicts an explicit --include-surfsense.
    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
    if surf_failed:
        if args.include_surfsense:
            logger.info(
                "Keeping %d surfsense_agentic failures in the retry set "
                "(--include-surfsense).",
                len(surf_failed),
            )
        else:
            logger.warning(
                "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
                "If you want those retried too, start backend + celery and rerun "
                "with --include-surfsense.",
                len(surf_failed),
            )
            failed = [f for f in failed if f.arm != "surfsense_agentic"]
    else:
        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
    if not failed:
        logger.info("Nothing left to retry after filtering.")
        return 0

    by_arm_count: dict[str, int] = {}
    for f in failed:
        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
    logger.info(
        "Loaded %d failed rows across %d arms: %s",
        len(failed), len(by_arm_count),
        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
    )

    doc_map = _load_doc_map(map_path)
    qtext_idx = _load_question_text_index(questions_jsonl)

    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")

    native_provider = OpenRouterPdfProvider(
        api_key=api_key,
        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
        model=args.llm_model,
        engine=PdfEngine(args.pdf_engine),
    )
    native_arm = NativePdfArm(
        provider=native_provider, max_output_tokens=args.max_output_tokens,
    )
    # One chat-backed arm per long-context arm name that actually has failures.
    lc_arms: dict[str, BareLlmArm] = {}
    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
        lc_provider = OpenRouterChatProvider(
            api_key=api_key,
            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
            model=args.llm_model,
        )
        lc_arms[arm_name] = BareLlmArm(
            provider=lc_provider,
            max_output_tokens=args.max_output_tokens,
            name=arm_name,
        )

    # ``plan`` and ``coros`` are appended in lock-step so they can be zipped
    # back together once the retries have finished.
    coros: list = []
    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
    for f in failed:
        # Look up the question text from questions.jsonl
        try:
            q_idx = _qid_index(f.qid)
        except Exception:
            logger.error("Bad qid %r — skipping", f.qid)
            continue
        qrow = qtext_idx.get((f.doc_id, q_idx))
        if qrow is None:
            logger.error(
                "Could not find question text for %s (idx %d) — skipping",
                f.doc_id, q_idx,
            )
            continue
        question_text = str(qrow.get("question") or "").strip()
        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()

        map_row = doc_map.get(f.doc_id)
        if map_row is None:
            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
            continue

        if f.arm == "native_pdf":
            pdf_path = Path(map_row["pdf_path"])
            if not pdf_path.exists():
                logger.error("PDF missing on disk: %s — skipping", pdf_path)
                continue
            request = _build_native_request(
                f.qid, question_text, answer_format, pdf_path,
                max_output_tokens=args.max_output_tokens,
            )
            arm_obj = native_arm
        elif f.arm in LC_ARMS:
            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
            md_path_str = ext_blob.get("markdown_path")
            if not md_path_str or ext_blob.get("status") != "ok":
                logger.error(
                    "Missing extraction for %s on %s — cannot retry; skipping",
                    f.arm, f.doc_id,
                )
                continue
            request = _build_lc_request(
                f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
            )
            arm_obj = lc_arms[f.arm]
        else:
            logger.warning("Unhandled arm %s — skipping", f.arm)
            continue

        plan.append((f, request, arm_obj))
        coros.append(_retry_one(
            arm_obj, request,
            arm_name=f.arm, qid=f.qid,
            max_attempts=args.max_attempts,
            base_delay=args.base_delay,
            max_delay=args.max_delay,
        ))

    if not coros:
        logger.warning("Nothing to retry after request building.")
        return 0

    logger.info(
        "Retrying %d failed rows with up to %d attempts each "
        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
        len(coros), args.max_attempts, args.base_delay, args.max_delay,
        args.concurrency,
    )
    started = time.monotonic()
    outcomes: list[RetryOutcome] = await _gather_with_limit(
        coros, concurrency=args.concurrency,
    )
    elapsed = time.monotonic() - started
    logger.info("Retry pass finished in %.1fs.", elapsed)

    out_path = run_dir / "raw_retries.jsonl"
    summary_path = run_dir / "raw_retries_summary.json"
    per_arm_recovered: dict[str, int] = {}
    per_arm_total: dict[str, int] = {}
    per_arm_attempts_dist: dict[str, list[int]] = {}
    with out_path.open("w", encoding="utf-8") as fh:
        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
            if outcome.recovered:
                per_arm_recovered[outcome.arm] = (
                    per_arm_recovered.get(outcome.arm, 0) + 1
                )
            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
                len(outcome.attempts)
            )
            g = grade(
                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
                gold=f.gold,
                answer_format=f.answer_format,
            )
            # Keys from ``to_jsonl()`` are spread in after the row metadata,
            # so a shared key such as "arm" takes the retry result's value.
            row = {
                "qid": f.qid,
                "doc_id": f.doc_id,
                "arm": f.arm,
                "answer_format": f.answer_format,
                "gold": f.gold,
                "pages": f.pages,
                "document_id": f.document_id,
                "original_error": f.original_error,
                "retry": {
                    "max_attempts": args.max_attempts,
                    "n_attempts": len(outcome.attempts),
                    "recovered": outcome.recovered,
                    "attempts": [
                        {
                            "attempt": a.attempt,
                            "started_iso": a.started_iso,
                            "latency_ms": a.latency_ms,
                            "error": a.error,
                            "raw_text_chars": a.raw_text_chars,
                        }
                        for a in outcome.attempts
                    ],
                },
                **outcome.final_result.to_jsonl(),
                "graded": {
                    "correct": g.correct,
                    "f1": g.f1,
                    "method": g.method,
                    "normalised_pred": g.normalised_pred,
                    "normalised_gold": g.normalised_gold,
                },
            }
            fh.write(json.dumps(row) + "\n")

    summary = {
        "run_id": args.run_id,
        "raw_retries_path": str(out_path.relative_to(REPO)),
        "n_failed_rows_input": len(failed),
        "n_retried": len(coros),
        "elapsed_s": round(elapsed, 1),
        "config": {
            "max_attempts": args.max_attempts,
            "base_delay": args.base_delay,
            "max_delay": args.max_delay,
            "concurrency": args.concurrency,
            "llm_model": args.llm_model,
            "pdf_engine": args.pdf_engine,
            "max_output_tokens": args.max_output_tokens,
        },
        "per_arm": {
            arm: {
                "tried": per_arm_total.get(arm, 0),
                "recovered": per_arm_recovered.get(arm, 0),
                "still_failed": (
                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
                ),
                "recovery_rate": (
                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
                    if per_arm_total.get(arm) else 0.0
                ),
                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
            }
            for arm in sorted(per_arm_total)
        },
        "totals": {
            "tried": sum(per_arm_total.values()),
            "recovered": sum(per_arm_recovered.values()),
            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
        },
    }
    summary_path.write_text(
        json.dumps(summary, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

    print()
    print("=" * 78)
    print("Retry pass summary")
    print("=" * 78)
    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
    print(header)
    print("-" * len(header))
    for arm in sorted(per_arm_total):
        tried = per_arm_total[arm]
        rec = per_arm_recovered.get(arm, 0)
        rate = (rec / tried * 100) if tried else 0.0
        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
    total = sum(per_arm_total.values())
    rec_total = sum(per_arm_recovered.values())
    rate_total = (rec_total / total * 100) if total else 0.0
    print("-" * len(header))
    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
          f"{rate_total:>6.1f}%")
    print()
    print(f"Wrote {out_path.relative_to(REPO)}")
    print(f"Wrote {summary_path.relative_to(REPO)}")
    return 0
def main() -> None:
    """Parse the retry CLI flags, run ``_run`` and exit with its return code."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--run-id",
        default="2026-05-14T00-53-19Z",
        help=(
            "Run timestamp under data/multimodal_doc/runs/. Default is the "
            "n=171 production run we wrote up in the blog."
        ),
    )
    cli.add_argument("--max-attempts", type=int, default=5)
    cli.add_argument(
        "--base-delay",
        type=float,
        default=1.0,
        help="Base seconds for exponential backoff (default 1s).",
    )
    cli.add_argument(
        "--max-delay",
        type=float,
        default=30.0,
        help="Cap on per-retry sleep (default 30s).",
    )
    cli.add_argument(
        "--concurrency",
        type=int,
        default=2,
        help=(
            "Parallel retries in flight (default 2 — keep low "
            "to avoid the same transport stress that caused "
            "the original failures)."
        ),
    )
    cli.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
    cli.add_argument(
        "--pdf-engine",
        default="native",
        choices=[engine.value for engine in PdfEngine],
    )
    cli.add_argument("--max-output-tokens", type=int, default=512)
    cli.add_argument(
        "--include-surfsense",
        action="store_true",
        help=(
            "Also retry surfsense_agentic failures (requires backend + celery up). "
            "Default is to skip them since the n=171 run had 0 SurfSense failures."
        ),
    )
    exit_code = asyncio.run(_run(cli.parse_args()))
    raise SystemExit(exit_code)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,65 @@
"""Render a quick textual summary of the latest CRAG run."""
from __future__ import annotations
import glob
import json
def main() -> None:
    """Print a text summary of the most recent CRAG run artifact.

    Looks for ``data/research/runs/*/crag/run_artifact.json`` relative to the
    current working directory and picks the last path in sorted order. Prints
    a placeholder line and returns when no artifact exists. Otherwise prints
    per-arm metrics, pairwise deltas, and truthfulness broken down by
    question type and by domain.
    """
    arms = ("bare_llm", "long_context", "surfsense")
    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
    if not runs:
        print("(no CRAG runs found)")
        return
    latest = runs[-1]
    # Context manager so the handle is closed deterministically instead of
    # being left to the garbage collector.
    with open(latest, encoding="utf-8") as fh:
        m = json.load(fh)
    metrics = m["metrics"]
    print(f"Reading: {latest}")
    print(f"n_questions: {m['extra']['n_questions']}")
    print()
    print("=== ARMS ===")
    for arm in arms:
        d = metrics[arm]
        print(
            f"{arm:14s}: "
            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
            f"correct={d['correct_rate']*100:5.1f}% "
            f"missing={d['missing_rate']*100:5.1f}% "
            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
            f"truth={d['truthfulness_score']*100:+5.1f}%"
        )
    print()
    print("=== DELTAS ===")
    for key, d in metrics["deltas"].items():
        print(
            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
        )
    print()
    print("=== PER-QUESTION-TYPE TRUTHFULNESS ===")
    for qt, row in sorted(metrics["per_question_type"].items()):
        n = row["n"]
        pieces = [f"{qt:20s} (n={n:3d}):"]
        # An arm may be absent from a slice; only print the ones present.
        for arm in arms:
            if arm in row:
                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
        print(" ".join(pieces))
    print()
    print("=== PER-DOMAIN TRUTHFULNESS ===")
    for dom, row in sorted(metrics["per_domain"].items()):
        n = row["n"]
        pieces = [f"{dom:10s} (n={n:3d}):"]
        for arm in arms:
            if arm in row:
                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
        print(" ".join(pieces))


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,122 @@
"""Slice the parser_compare raw.jsonl for the n=171 run.
Reports per-arm:
* tokens & cost stats (input/output mean, $/Q distribution)
* failures (status != ok or empty raw_text)
* answer_format breakdown (accuracy by str/int/float/list)
Plus surfsense agentic breakdown so we can compare apples to apples
even though the new_chat SSE doesn't surface per-call token counts.
"""
from __future__ import annotations
import json
import statistics
from collections import defaultdict
from pathlib import Path
# Second ancestor directory of this (resolved) script file.
REPO = Path(__file__).resolve().parents[1]
# parser_compare output directory of the one hard-coded run being analysed.
RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
# Result rows, one JSON object per line.
RAW = RUN_DIR / "raw.jsonl"
# Aggregated run artifact; ``main`` reads ``metrics.per_arm`` from it.
ARTIFACT = RUN_DIR / "run_artifact.json"
def main() -> None:
    """Print per-arm statistics for the hard-coded parser_compare run.

    Reads every row of ``RAW`` and aggregates, per arm: row count, number of
    correct rows, failures, cost / token / latency samples, and accuracy per
    ``answer_format``. F1 and the aggregated cost figures are taken from
    ``ARTIFACT`` (``metrics.per_arm``).

    The ``fail`` column counts rows with an ``error`` plus rows whose
    ``raw_text`` is empty, matching the module docstring's definition of a
    failure.
    """
    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
    print(f"raw rows: {len(rows)}")
    # Only the number of distinct question ids is reported, so a set suffices.
    print(f"unique questions: {len({row['qid'] for row in rows})}")

    arm_metrics: dict[str, dict] = defaultdict(lambda: {
        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
    })
    for row in rows:
        arm = row["arm"]
        m = arm_metrics[arm]
        m["n"] += 1
        graded = row.get("graded") or {}
        if graded.get("correct"):
            m["n_correct"] += 1
        err = row.get("error")
        raw_text = row.get("raw_text") or ""
        # Errored rows and error-free-but-empty rows are tracked separately
        # and summed for display below.
        if err:
            m["n_failed"] += 1
        elif not raw_text.strip():
            m["n_empty"] += 1
        cost = row.get("cost_usd")
        if cost is not None:
            m["costs"].append(float(cost))
        ut = row.get("usage") or {}
        if ut.get("prompt_tokens"):
            m["in_tokens"].append(ut["prompt_tokens"])
        if ut.get("completion_tokens"):
            m["out_tokens"].append(ut["completion_tokens"])
        if row.get("latency_ms"):
            m["latency_ms"].append(row["latency_ms"])
        fmt = row.get("answer_format") or "unknown"
        m["by_format"][fmt]["n"] += 1
        if graded.get("correct"):
            m["by_format"][fmt]["correct"] += 1

    print()
    print("=" * 100)
    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
    print("=" * 100)
    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
    per_arm_art = art["metrics"]["per_arm"]
    for arm, m in sorted(arm_metrics.items()):
        acc = m["n_correct"] / m["n"] * 100
        # Previously only ``n_failed`` was shown and ``n_empty`` was dropped,
        # under-reporting failures.
        fail = m["n_failed"] + m["n_empty"]
        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
        print(
            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
        )

    print()
    print("by answer_format (accuracy):")
    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"].keys()})
    header = f"{'arm':<25} " + " ".join(f"{f:>10}" for f in formats)
    print(header)
    print("-" * len(header))
    for arm, m in sorted(arm_metrics.items()):
        cells = []
        for f in formats:
            row = m["by_format"][f]
            if row["n"] == 0:
                cells.append(f"{'-':>10}")
            else:
                pct = row["correct"] / row["n"] * 100
                cells.append(f"{pct:>5.0f}% ({row['correct']:>2}/{row['n']:>2})")
        print(f"{arm:<25} " + " ".join(cells))

    print()
    print("=" * 100)
    print("Aggregated cost (from run_artifact.json):")
    for arm, row in per_arm_art.items():
        print(
            f" {arm:<25} acc={row['accuracy']*100:5.1f}% "
            f" $/Q LLM={row['llm_cost_per_q']:.4f} "
            f" preprocess total=${row['preprocess_cost_total']:.2f} "
            f" $/Q total={row['total_cost_per_q']:.4f}"
        )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,155 @@
"""Test the hypothesis: were the LC-arm errors actually context-window
overflow errors disguised as SSL / network failures?
If true, we'd expect:
(a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
(b) failures correlated with extraction size / input_tokens (large doc -> failure),
(c) failing requests near or over Sonnet 4.5's 200k input-token limit.
If false (transport-layer hypothesis), we'd expect:
(a) only SSL / 502 / empty stream / JSONDecode strings,
(b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
(c) failing requests well below the 200k limit.
"""
from __future__ import annotations
import json
import statistics
from collections import defaultdict
from pathlib import Path
# Second ancestor directory of this (resolved) script file.
REPO = Path(__file__).resolve().parents[1]
# parser_compare output directory of the one hard-coded run being analysed.
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
# Result rows, one JSON object per line.
RAW = RUN / "raw.jsonl"
# Per-document manifest; ``main`` reads ``doc_id`` and ``extractions`` from it.
MANIFEST = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
# Substrings searched for in lower-cased error messages. A match flags the
# row as a possible context-window overflow in section (a) of the report.
CONTEXT_HINTS = (
    "context_length",
    "context window",
    "prompt is too long",
    "exceeds",
    "maximum context",
    "input tokens",
    "too many tokens",
    "over the maximum",
    "200000",
    "200_000",
)
def main() -> None:
    """Print the four-part report testing the context-overflow hypothesis.

    (a) error strings that mention context limits, (b) extraction sizes for
    OK vs failed rows per parser arm, (c) the largest extraction each arm
    handled successfully vs unsuccessfully, and (d) the rows for the one
    document/arm pair singled out as an overflow candidate.
    """

    def banner(title: str) -> None:
        # Every section heading is framed by two 80-character rules.
        print("=" * 80)
        print(title)
        print("=" * 80)

    rows = []
    for raw_line in RAW.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            rows.append(json.loads(raw_line))

    # (doc_id, arm) -> extraction size in characters, taken from the manifest.
    extraction_size: dict[tuple[str, str], int] = {}
    for manifest_line in MANIFEST.read_text(encoding="utf-8").splitlines():
        if not manifest_line.strip():
            continue
        entry = json.loads(manifest_line)
        for arm_key, extraction in (entry.get("extractions") or {}).items():
            extraction_size[(entry["doc_id"], arm_key)] = int(extraction.get("chars") or 0)

    banner("(a) Literal 'context window' / 'prompt too long' error strings?")
    hits = 0
    for record in rows:
        message = (record.get("error") or "").lower()
        if message and any(hint in message for hint in CONTEXT_HINTS):
            print(f" {record['arm']:<25} {record['qid']:<50}")
            print(f" -> {message[:240]}")
            hits += 1
    if hits == 0:
        print(" none found.")

    print()
    banner("(b) Extraction size for OK vs FAILED rows per arm")
    arm_buckets: dict[str, dict[str, list[int]]] = defaultdict(
        lambda: {"ok": [], "fail": []}
    )
    parser_arms = (
        "azure_basic_lc", "azure_premium_lc",
        "llamacloud_basic_lc", "llamacloud_premium_lc",
    )
    for record in rows:
        arm_key = record["arm"]
        if arm_key not in parser_arms:
            continue
        chars = extraction_size.get((record["doc_id"], arm_key), 0)
        # A row fails if it has an error or its text is empty.
        if record.get("error") or not (record.get("raw_text") or "").strip():
            outcome = "fail"
        else:
            outcome = "ok"
        arm_buckets[arm_key][outcome].append(chars)
    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
    for arm_key in parser_arms:
        for outcome in ("ok", "fail"):
            samples = arm_buckets[arm_key][outcome]
            if samples:
                print(
                    f" {arm_key:<23} {outcome:<5} {len(samples):>4} "
                    f"{statistics.mean(samples):>12,.0f} "
                    f"{statistics.median(samples):>10,.0f} "
                    f"{max(samples):>10,}"
                )
            else:
                print(f" {arm_key:<23} {outcome:<5} {0:>4} -")

    print()
    banner("(c) Largest extraction each arm processed *successfully* vs *failed*")
    print(
        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
        "context-overflow, max-OK would be near that cap. If max-OK is well "
        "above max-FAIL, the model handled bigger contexts than the failed "
        "ones, so size cannot be the cause.)"
    )
    print()
    for arm_key in parser_arms:
        passed = arm_buckets[arm_key]["ok"]
        failed = arm_buckets[arm_key]["fail"]
        if not passed:
            continue
        biggest_ok = max(passed)
        biggest_fail = max(failed) if failed else 0
        print(
            f" {arm_key:<25} max OK = {biggest_ok:>10,} chars (~{biggest_ok / 4:>7,.0f} tokens) "
            f"max FAIL = {biggest_fail:>10,} chars (~{biggest_fail / 4:>7,.0f} tokens)"
        )

    print()
    banner("(d) Did the *known* overflow candidate fail?")
    print(
        " 3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
        "-- this is above Sonnet 4.5's 200k window."
    )
    print(" If transport hypothesis is correct, this should still fail with a "
          "real overflow error.")
    print(" If transport hypothesis is correct AND the model truncates silently, "
          "it might 'succeed' but be wrong.")
    print()
    for record in rows:
        if record["doc_id"] != "3M_2018_10K.pdf" or record["arm"] != "llamacloud_premium_lc":
            continue
        shown_error = record.get("error") or "(none)"
        grading = record.get("graded") or {}
        print(
            f" {record['qid']:<40} correct={grading.get('correct')!s:<5} "
            f"err={shown_error[:100]}"
        )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,10 @@
"""SurfSense Evals — domain-agnostic eval harness.
Public entry-point is the ``surfsense_evals`` CLI (``python -m surfsense_evals``).
Programmatic embedding is a non-goal for now; everything goes through the CLI
+ filesystem outputs (state.json, raw run JSONL, summary.md/json reports).
"""
from __future__ import annotations
__version__ = "0.1.0"

View file

@ -0,0 +1,13 @@
"""Module entry point: ``python -m surfsense_evals ...``.
Delegates to ``core.cli.main``. ``core.cli`` lazily imports
``surfsense_evals.suites`` so every benchmark gets a chance to register
before argparse builds its subcommand groups.
"""
from __future__ import annotations
from surfsense_evals.core.cli import main
# Only run the CLI when this module is executed as a script; the process
# exit status is whatever ``main()`` returns.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())

View file

@ -0,0 +1,8 @@
"""Domain-agnostic infrastructure shared by every suite.
Nothing under ``core/`` knows or cares about a specific evaluation domain.
Suites live under ``surfsense_evals.suites.<domain>.<benchmark>`` and
register themselves with ``core.registry`` on import.
"""
from __future__ import annotations

View file

@ -0,0 +1,44 @@
"""Arm protocol + concrete arms shared across suites.
Concrete arms (``NativePdfArm``, ``SurfSenseArm``, ``BareLlmArm``) are
imported lazily via ``__getattr__`` so consumers that only need the
protocol — e.g. the registry's ``Arm`` re-export — don't transitively
pull in ``httpx`` providers or the SurfSense client unless they
actually use those arms.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from .base import Arm, ArmRequest, ArmResult
if TYPE_CHECKING: # pragma: no cover
from .bare_llm import BareLlmArm
from .native_pdf import NativePdfArm
from .surfsense import SurfSenseArm
__all__ = [
"Arm",
"ArmRequest",
"ArmResult",
"BareLlmArm",
"NativePdfArm",
"SurfSenseArm",
]
def __getattr__(name: str): # PEP 562
if name == "NativePdfArm":
from .native_pdf import NativePdfArm
return NativePdfArm
if name == "SurfSenseArm":
from .surfsense import SurfSenseArm
return SurfSenseArm
if name == "BareLlmArm":
from .bare_llm import BareLlmArm
return BareLlmArm
raise AttributeError(f"module 'surfsense_evals.core.arms' has no attribute {name!r}")

View file

@ -0,0 +1,100 @@
"""Bare-LLM arm: chat completion with prompt-only input, no retrieval.
Pairs with ``SurfSenseArm`` for any benchmark that wants to measure
"how much does the model already know without RAG?". For factuality /
multi-hop benchmarks (FRAMES, MuSiQue, …) this produces the published
"naive prompting" baseline — e.g. FRAMES's 40.8% on Gemini-Pro-1.5.
Symmetric with ``NativePdfArm`` in shape, but the request carries no
``pdf_paths``: the prompt itself is the only input the model gets.
"""
from __future__ import annotations
import logging
from ..providers.openrouter_chat import OpenRouterChatProvider
from .base import Arm, ArmRequest, ArmResult
logger = logging.getLogger(__name__)
class BareLlmArm(Arm):
    """Prompt-only ``Arm`` that delegates to an ``OpenRouterChatProvider``.

    The class-level ``name`` is ``"bare_llm"``; passing ``name`` to the
    constructor overrides it on that instance. This lets a suite run several
    chat-backed arms (for example CRAG's ``bare_llm`` and ``long_context``)
    side by side while keeping their results labelled separately.
    """

    name: str = "bare_llm"

    def __init__(
        self,
        *,
        provider: OpenRouterChatProvider,
        max_output_tokens: int | None = 1024,
        system_prompt: str | None = None,
        name: str | None = None,
    ) -> None:
        # A falsy ``name`` keeps the class-level default.
        if name:
            self.name = name
        self._system_prompt = system_prompt
        self._max_output = max_output_tokens
        self._provider = provider

    @classmethod
    def from_env(
        cls,
        *,
        api_key: str,
        model: str,
        base_url: str = "https://openrouter.ai/api/v1",
        max_output_tokens: int | None = 1024,
        system_prompt: str | None = None,
        name: str | None = None,
    ) -> BareLlmArm:
        """Build the provider from connection settings and wrap it in an arm."""
        return cls(
            provider=OpenRouterChatProvider(
                api_key=api_key,
                base_url=base_url,
                model=model,
            ),
            max_output_tokens=max_output_tokens,
            system_prompt=system_prompt,
            name=name,
        )

    async def answer(self, request: ArmRequest) -> ArmResult:
        """Send ``request.prompt`` to the provider and wrap the reply.

        Any exception from the provider call is converted into an
        ``ArmResult`` with empty text and ``error`` set to
        ``"<ExceptionType>: <message>"`` rather than propagated.
        """
        try:
            reply = await self._provider.complete(
                prompt=request.prompt,
                system_prompt=self._system_prompt,
                max_tokens=self._max_output,
            )
        except Exception as exc:  # noqa: BLE001
            failure = f"{type(exc).__name__}: {exc}"
            return ArmResult(
                arm=self.name,
                question_id=request.question_id,
                raw_text="",
                error=failure,
            )
        return ArmResult(
            arm=self.name,
            question_id=request.question_id,
            raw_text=reply.text,
            input_tokens=reply.input_tokens,
            output_tokens=reply.output_tokens,
            cost_micros=reply.cost_micros,
            latency_ms=reply.latency_ms,
            extra={
                "model": self._provider.model,
                "finish_reason": reply.finish_reason,
            },
        )
__all__ = ["BareLlmArm"]

View file

@ -0,0 +1,93 @@
"""Arm protocol + the value types every arm exchanges with a runner.
An ``Arm`` is "one way to answer one question". Two ship in this PR:
* ``NativePdfArm`` — drop the PDF straight into an OpenRouter
chat-completions request with ``plugins=[{file-parser, engine:
native}]``. Used for the head-to-head "is the model good enough on
its own?" measurement.
* ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with the question
scoped to the relevant ``mentioned_document_ids``; consume the SSE
stream and parse citations.
Both implement the same protocol so a benchmark runner only sees
``Arm.answer(request) -> ArmResult``.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Protocol
@dataclass
class ArmRequest:
    """One arm-call worth of input.

    * ``question_id`` is opaque — used for logging and joining results.
    * ``prompt`` is the fully-formatted text the arm should send. The
      runner is responsible for prompt construction so head-to-head
      comparisons use byte-identical text.
    * ``pdf_paths`` is the per-question source PDFs (used by
      ``NativePdfArm``). Empty for retrieval-only / corpus-wide
      benchmarks.
    * ``mentioned_document_ids`` is the SurfSense document scoping list
      (used by ``SurfSenseArm``). When ``None`` SurfSense retrieves
      across the whole search space.
    * ``options`` is a free-form bag of arm-specific overrides
      (e.g. SurfSense's ``disabled_tools``).
    """

    question_id: str
    prompt: str
    # Mutable defaults go through ``default_factory`` so each request gets
    # its own list / dict.
    pdf_paths: list[Path] = field(default_factory=list)
    mentioned_document_ids: list[int] | None = None
    options: dict[str, Any] = field(default_factory=dict)
@dataclass
class ArmResult:
    """What a single ``Arm.answer`` call produced.

    Only ``arm``, ``question_id`` and ``raw_text`` are required. Token, cost
    and latency counters default to 0, ``citations`` and ``extra`` default to
    fresh empty containers, and ``error`` stays ``None`` unless the call
    failed.
    """

    arm: str
    question_id: str
    raw_text: str
    answer_letter: str | None = None
    citations: list[dict[str, Any]] = field(default_factory=list)
    input_tokens: int = 0
    output_tokens: int = 0
    cost_micros: int = 0
    latency_ms: int = 0
    error: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)

    @property
    def ok(self) -> bool:
        """True when no error was recorded for this result."""
        return self.error is None

    def to_jsonl(self) -> dict[str, Any]:
        """Stable dict shape for ``data/<suite>/runs/<ts>/<bench>_raw.jsonl``."""
        # Explicit column order: it is part of the serialised shape and
        # deliberately differs from the field declaration order.
        columns = (
            "arm",
            "question_id",
            "answer_letter",
            "raw_text",
            "citations",
            "input_tokens",
            "output_tokens",
            "cost_micros",
            "latency_ms",
            "error",
            "extra",
        )
        return {column: getattr(self, column) for column in columns}
class Arm(Protocol):
    """One concrete way to answer questions for a given run."""

    # Label identifying the arm; concrete arms copy it into ``ArmResult.arm``.
    name: str

    # Single entry point: take one request, return one result.
    async def answer(self, request: ArmRequest) -> ArmResult:  # pragma: no cover - protocol
        ...

View file

@ -0,0 +1,104 @@
"""Native-PDF arm: drop the PDF straight into OpenRouter chat-completions.
Generic across suites — a benchmark just supplies the prompt and the
single PDF path. Multi-PDF questions concatenate in the runner before
calling this arm so each ``answer`` invocation feeds the model exactly
one ``data:application/pdf;base64,...`` block (matches the human
"drag-and-drop one PDF into Claude" intent).
"""
from __future__ import annotations
import logging
from ..parse.answer_letter import extract_answer_letter
from ..providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
from .base import Arm, ArmRequest, ArmResult
logger = logging.getLogger(__name__)
class NativePdfArm(Arm):
    """``Arm`` that sends one PDF plus the prompt through ``OpenRouterPdfProvider``."""

    name: str = "native_pdf"

    def __init__(
        self,
        *,
        provider: OpenRouterPdfProvider,
        max_output_tokens: int | None = 1024,
    ) -> None:
        self._max_output = max_output_tokens
        self._provider = provider

    @classmethod
    def from_env(
        cls,
        *,
        api_key: str,
        model: str,
        engine: PdfEngine = PdfEngine.NATIVE,
        base_url: str = "https://openrouter.ai/api/v1",
        max_output_tokens: int | None = 1024,
    ) -> NativePdfArm:
        """Build the provider from connection settings and wrap it in an arm."""
        return cls(
            provider=OpenRouterPdfProvider(
                api_key=api_key,
                base_url=base_url,
                model=model,
                engine=engine,
            ),
            max_output_tokens=max_output_tokens,
        )

    async def answer(self, request: ArmRequest) -> ArmResult:
        """Answer ``request`` using the first entry of ``request.pdf_paths``.

        Returns an error result (no provider call) when no PDF is supplied.
        Any exception from the provider call is converted into an error
        result instead of being propagated.
        """
        pdfs = request.pdf_paths
        if not pdfs:
            return ArmResult(
                arm=self.name,
                question_id=request.question_id,
                raw_text="",
                error="native_pdf arm requires at least one pdf_path",
            )
        if len(pdfs) > 1:
            # Runners are expected to merge multi-PDF questions upstream so
            # the comparison stays one-PDF-per-question; extras are ignored.
            logger.debug(
                "qid=%s native_pdf got %d pdfs; using first only",
                request.question_id,
                len(pdfs),
            )
        first_pdf = pdfs[0]
        try:
            reply = await self._provider.complete(
                prompt=request.prompt,
                pdf_path=first_pdf,
                max_tokens=self._max_output,
            )
        except Exception as exc:  # noqa: BLE001
            failure = f"{type(exc).__name__}: {exc}"
            return ArmResult(
                arm=self.name,
                question_id=request.question_id,
                raw_text="",
                error=failure,
            )
        parsed = extract_answer_letter(reply.text)
        return ArmResult(
            arm=self.name,
            question_id=request.question_id,
            raw_text=reply.text,
            answer_letter=parsed.letter,
            input_tokens=reply.input_tokens,
            output_tokens=reply.output_tokens,
            cost_micros=reply.cost_micros,
            latency_ms=reply.latency_ms,
            extra={
                "model": self._provider.model,
                "engine": self._provider.engine.value,
                "answer_letter_strategy": parsed.strategy,
                "finish_reason": reply.finish_reason,
                "pdf_filename": first_pdf.name,
            },
        )
__all__ = ["NativePdfArm"]

View file

@ -0,0 +1,104 @@
"""SurfSense arm: per-question fresh thread + ``/api/v1/new_chat`` stream.
For every question:
* Create a fresh ``NewChatThread`` on the suite's pinned SearchSpace.
This sidesteps the per-thread ``THREAD_BUSY`` 409 (a single thread
serialises turns, see ``surfsense_backend/app/routes/new_chat_routes.py:191-220``).
* POST ``/api/v1/new_chat`` with the prompt and the per-question
``mentioned_document_ids`` (``surfsense_backend/app/schemas/new_chat.py:241-243``).
* Consume the SSE stream via ``NewChatClient.ask`` which accumulates
text deltas and returns ``StreamedAnswer``.
* Optionally delete the thread (default ON for ephemeral runs).
Citations are parsed from the streamed assistant text via the
canonical regex port; chunk ids are returned in ``ArmResult.citations``
for the runner to map back to corpus ids.
"""
from __future__ import annotations
import logging
from ..clients import NewChatClient
from ..parse.answer_letter import extract_answer_letter
from .base import Arm, ArmRequest, ArmResult
logger = logging.getLogger(__name__)
class SurfSenseArm(Arm):
    """``Arm`` that answers each question on a fresh thread via ``NewChatClient``."""

    name: str = "surfsense"

    def __init__(
        self,
        *,
        client: NewChatClient,
        search_space_id: int,
        ephemeral_threads: bool = True,
        thread_title_prefix: str = "eval",
    ) -> None:
        self._title_prefix = thread_title_prefix
        self._ephemeral = ephemeral_threads
        self._search_space_id = search_space_id
        self._client = client

    async def _drop_thread(self, thread_id: int | None) -> None:
        """Best-effort delete of an ephemeral thread; failures are only logged."""
        if not self._ephemeral or thread_id is None:
            return
        try:
            await self._client.delete_thread(thread_id)
        except Exception as exc:  # noqa: BLE001
            logger.debug(
                "Failed to delete thread %s: %s", thread_id, exc
            )

    async def answer(self, request: ArmRequest) -> ArmResult:
        """Create a thread, ask the question on it, and wrap the streamed reply.

        Exceptions from thread creation or the ask call become an error
        result. The thread is deleted afterwards when ``ephemeral_threads``
        is on, on both the success and the failure path.
        """
        thread_id: int | None = None
        try:
            thread_id = await self._client.create_thread(
                search_space_id=self._search_space_id,
                title=f"{self._title_prefix}:{request.question_id}",
            )
            streamed = await self._client.ask(
                thread_id=thread_id,
                search_space_id=self._search_space_id,
                user_query=request.prompt,
                mentioned_document_ids=request.mentioned_document_ids,
                disabled_tools=request.options.get("disabled_tools"),
            )
        except Exception as exc:  # noqa: BLE001
            return ArmResult(
                arm=self.name,
                question_id=request.question_id,
                raw_text="",
                error=f"{type(exc).__name__}: {exc}",
                extra={"thread_id": thread_id},
            )
        finally:
            await self._drop_thread(thread_id)

        parsed = extract_answer_letter(streamed.text)
        # Token and cost fields are left at their 0 defaults: the SSE stream
        # does not report them today, and estimating them from the text would
        # bias the comparison against this arm.
        details = {
            "thread_id": thread_id,
            "search_space_id": self._search_space_id,
            "answer_letter_strategy": parsed.strategy,
            "user_message_id": streamed.user_message_id,
            "assistant_message_id": streamed.assistant_message_id,
            "finished_normally": streamed.finished_normally,
            "n_raw_events": len(streamed.raw_events),
            "n_mentioned_documents": len(request.mentioned_document_ids or []),
        }
        return ArmResult(
            arm=self.name,
            question_id=request.question_id,
            raw_text=streamed.text,
            answer_letter=parsed.letter,
            citations=streamed.citations,
            latency_ms=streamed.latency_ms,
            extra=details,
        )
__all__ = ["SurfSenseArm"]

View file

@ -0,0 +1,273 @@
"""Dual-mode credential resolver + httpx client factory with 401 auto-refresh.
SurfSense supports ``AUTH_TYPE=LOCAL`` (email + password) and
``AUTH_TYPE=GOOGLE`` (Google OAuth; the frontend stores the JWT in ``localStorage``).
There is no headless equivalent of the Google flow, so the harness handles
both modes by treating the JWT as the universal credential:
* **LOCAL**: harness POSTs form-encoded ``username`` + ``password`` to
``/auth/jwt/login``, reads ``{access_token, refresh_token}``.
* **GOOGLE / pre-issued JWT**: operator pastes their existing JWT (and
optionally refresh token) into ``SURFSENSE_JWT`` /
``SURFSENSE_REFRESH_TOKEN``; harness skips login.
Either way ``client_with_auth`` returns one shared
``httpx.AsyncClient`` with ``Authorization: Bearer <jwt>`` set and an
event hook that, on a 401 with a refresh token in scope, calls
``POST /auth/jwt/refresh`` and retries the original request once. JWT
lifetime defaults to one day backend-side, so this matters for long
MIRAGE runs.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any
import httpx
from .config import Config
logger = logging.getLogger(__name__)
class CredentialError(RuntimeError):
    """Signals that the harness could not obtain a usable SurfSense credential.

    Raised by ``acquire_token`` when no credential mode is configured and
    when a LOCAL login attempt is rejected or returns no access token.
    """
_NO_CREDENTIALS_MESSAGE = (
"No SurfSense credentials configured. Set ONE of:\n"
" (LOCAL) SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD\n"
" (GOOGLE) SURFSENSE_JWT (and optionally SURFSENSE_REFRESH_TOKEN)\n"
"For GOOGLE: log in to SurfSense in your browser, open DevTools → "
"Application → Local Storage → copy `surfsense_bearer_token` and "
"`surfsense_refresh_token` into those env vars."
)
@dataclass
class TokenBundle:
    """Mutable pair of tokens shared with the refresh logic.

    The refresh path overwrites ``access_token`` (and ``refresh_token``
    when the backend rotates it) in place, so every holder of the same
    bundle observes the new values.
    """

    # Bearer JWT stamped onto outgoing requests.
    access_token: str
    # Refresh token, or ``None`` when auto-refresh is unavailable.
    refresh_token: str | None = None
    # Informational label only ("local" or "jwt"); used in error messages.
    mode: str = "jwt"
# ---------------------------------------------------------------------------
# Token acquisition
# ---------------------------------------------------------------------------
async def acquire_token(config: Config, *, http: httpx.AsyncClient | None = None) -> TokenBundle:
    """Resolve the configured credentials into a ``TokenBundle``.

    Precedence:

    1. ``SURFSENSE_JWT`` set: use it directly. The refresh token is
       captured if supplied.
    2. ``SURFSENSE_USER_EMAIL`` + ``SURFSENSE_USER_PASSWORD`` set:
       form-encoded POST to ``/auth/jwt/login``.
    3. Neither: raise ``CredentialError``.

    Args:
        config: Harness configuration holding the credential settings and
            the API base URL.
        http: Optional client to use for the login call (lets tests
            inject a mocked client). If omitted, a one-shot client is
            created for the login call only.

    Returns:
        The resolved tokens, tagged with ``mode="jwt"`` or ``mode="local"``.

    Raises:
        CredentialError: No credential mode is configured, the login call
            did not return HTTP 200, or its body was not a JSON object
            containing ``access_token``.
    """
    if config.has_jwt_mode():
        return TokenBundle(
            access_token=config.surfsense_jwt or "",
            refresh_token=config.surfsense_refresh_token,
            mode="jwt",
        )
    if not config.has_local_mode():
        raise CredentialError(_NO_CREDENTIALS_MESSAGE)

    async def _login(client: httpx.AsyncClient) -> TokenBundle:
        response = await client.post(
            f"{config.surfsense_api_base}/auth/jwt/login",
            data={
                "username": config.surfsense_user_email,
                "password": config.surfsense_user_password,
            },
            headers={"Accept": "application/json"},
        )
        if response.status_code != 200:
            raise CredentialError(
                f"LOCAL login failed (HTTP {response.status_code}): "
                f"{_safe_text(response)}"
            )
        # A 200 with a non-JSON body (e.g. a proxy error page) must surface
        # as a credential problem, not as a raw decoding error.
        try:
            payload = response.json()
        except ValueError as exc:
            raise CredentialError(
                f"LOCAL login returned a non-JSON body: {_safe_text(response)}"
            ) from exc
        access = payload.get("access_token") if isinstance(payload, dict) else None
        if not access:
            raise CredentialError(
                f"LOCAL login response missing access_token: {payload!r}"
            )
        return TokenBundle(
            access_token=access,
            refresh_token=payload.get("refresh_token") or None,
            mode="local",
        )

    if http is not None:
        return await _login(http)
    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, connect=10.0)) as client:
        return await _login(client)
def _safe_text(response: httpx.Response, *, limit: int = 200) -> str:
body = response.text or ""
if len(body) > limit:
return body[:limit] + ""
return body
# ---------------------------------------------------------------------------
# httpx client + 401 auto-refresh
# ---------------------------------------------------------------------------
class _AuthState:
"""Shared mutable holder closed over by the auth event hook.
Kept private so callers can't accidentally mutate the access token
out-of-band; ``client_with_auth`` returns the client directly.
"""
def __init__(self, config: Config, tokens: TokenBundle) -> None:
self.config = config
self.tokens = tokens
self._refresh_in_flight: bool = False
def _build_auth_request(state: _AuthState, request: httpx.Request) -> None:
"""Stamp the current bearer onto ``request`` (request-event hook)."""
request.headers["Authorization"] = f"Bearer {state.tokens.access_token}"
async def _refresh_access_token(
state: _AuthState, transport: httpx.AsyncBaseTransport | None = None
) -> bool:
"""POST ``/auth/jwt/refresh`` with the current refresh token.
Returns ``True`` on success and updates ``state.tokens`` in place.
Returns ``False`` if no refresh token is configured or the call fails.
Recursive 401s are avoided by using a *new* client without the auth
hook.
"""
refresh = state.tokens.refresh_token
if not refresh:
return False
try:
async with httpx.AsyncClient(
timeout=httpx.Timeout(15.0, connect=5.0),
transport=transport,
) as inner:
response = await inner.post(
f"{state.config.surfsense_api_base}/auth/jwt/refresh",
json={"refresh_token": refresh},
headers={"Accept": "application/json"},
)
except httpx.HTTPError as exc:
logger.warning("Token refresh transport error: %s", exc)
return False
if response.status_code != 200:
logger.warning(
"Token refresh rejected (HTTP %s): %s",
response.status_code,
_safe_text(response),
)
return False
payload = response.json()
new_access = payload.get("access_token")
if not new_access:
logger.warning("Refresh response missing access_token: %r", payload)
return False
state.tokens.access_token = new_access
new_refresh = payload.get("refresh_token")
if new_refresh:
state.tokens.refresh_token = new_refresh
return True
def client_with_auth(
    config: Config,
    tokens: TokenBundle,
    *,
    timeout: float = 60.0,
    transport: httpx.AsyncBaseTransport | None = None,
    base_url: str | None = None,
) -> httpx.AsyncClient:
    """Create the shared authenticated ``httpx.AsyncClient`` for the SurfSense API.

    * Every outgoing request gets ``Authorization: Bearer <jwt>`` from
      the current contents of ``tokens``.
    * A 401 response triggers one refresh attempt (when a refresh token
      is configured) followed by one replay of the original request with
      the freshly stamped bearer header.
    * The retry is best-effort: a 401 that persists after the refresh
      attempt is returned to the caller, who can then re-authenticate
      manually.

    ``base_url`` scopes a sub-client (e.g. in tests). By default it is
    empty, so calling code passes full URLs.
    """
    state = _AuthState(config, tokens)

    async def _stamp_bearer(outgoing: httpx.Request) -> None:
        _build_auth_request(state, outgoing)

    # The 401 replay lives in ``_AuthAwareClient.send``: an httpx response
    # event hook cannot *replace* a response, so a subclass is required.
    return _AuthAwareClient(
        state=state,
        transport=transport,
        timeout=httpx.Timeout(timeout, connect=10.0),
        base_url=base_url or "",
        event_hooks={"request": [_stamp_bearer]},
    )
class _AuthAwareClient(httpx.AsyncClient):
    """``AsyncClient`` that replays a request once after a successful token refresh."""

    def __init__(self, *, state: _AuthState, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self._auth_state = state

    async def send(  # type: ignore[override]
        self, request: httpx.Request, **kwargs: Any
    ) -> httpx.Response:
        """Send ``request``; on a 401, refresh the bearer and resend once.

        The first response is returned unchanged when it is not a 401,
        when a refresh is already in flight, or when the refresh fails.
        """
        first = await super().send(request, **kwargs)
        auth = self._auth_state
        # Non-401s pass straight through; so do 401s seen while a refresh
        # is itself in flight.
        if first.status_code != 401 or auth._refresh_in_flight:
            return first

        auth._refresh_in_flight = True
        try:
            refreshed = await _refresh_access_token(auth)
        finally:
            auth._refresh_in_flight = False
        if not refreshed:
            return first

        # Release the rejected response, re-stamp the header and replay
        # the same request object once.
        await first.aclose()
        request.headers["Authorization"] = f"Bearer {auth.tokens.access_token}"
        return await super().send(request, **kwargs)
__all__ = [
"CredentialError",
"TokenBundle",
"acquire_token",
"client_with_auth",
]

View file

@ -0,0 +1,790 @@
"""Argparse CLI for ``python -m surfsense_evals``.
Subcommands:
* ``setup --suite <name> --provider-model <slug> [--agent-llm-id <int>]``
* ``teardown --suite <name>``
* ``models list [--provider openrouter] [--grep <s>]``
* ``suites list``
* ``benchmarks list [--suite <name>]``
* ``ingest <suite> <benchmark> [benchmark flags]``
* ``run <suite> <benchmark> [benchmark flags]``
* ``report --suite <name> [--benchmark <name>]``
The ``ingest`` / ``run`` subparsers are built dynamically from the
registry, so adding a new benchmark only requires registering it; the
CLI surface comes for free. ``add_run_args`` lets each benchmark
publish its own flags.
Design choices worth flagging:
* ``setup`` rejects ``agent_llm_id == 0`` (Auto / LiteLLM router) so
per-question accuracy is reproducible.
* ``setup`` validates that the picked LLM config has
``provider == "OPENROUTER"`` and ``model_name == --provider-model``
before declaring success, because both arms of the head-to-head must hit
the same OpenRouter slug.
* Lifecycle state is keyed by suite, so ``setup --suite legal`` does
not touch ``medical``'s SearchSpace, and vice versa.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import sys
from dataclasses import dataclass
from typing import Any
import sys
import httpx
from rich.console import Console
from rich.table import Table
# Windows' legacy console (cp1252) crashes when Rich tries to write characters
# outside the active codepage (e.g. '->', em-dashes, box-drawing). Force UTF-8
# on stdout/stderr and disable Rich's legacy_windows render path so the file
# stream is used directly. Modern Windows (>=10, VS Code terminal, Windows
# Terminal, PowerShell, cmd) all interpret ANSI escapes natively.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        try:
            _stream.reconfigure(encoding="utf-8", errors="replace")
        except (AttributeError, ValueError):
            # Best effort: presumably the stream was replaced by an object
            # without a usable ``reconfigure``; leave it as it is.
            pass
from . import registry
from .auth import CredentialError, acquire_token, client_with_auth
from .clients import SearchSpaceClient
from .clients.search_space import LlmPreferences
from .config import (
DEFAULT_SCENARIO,
SCENARIOS,
Config,
SuiteState,
clear_suite_state,
get_suite_state,
load_config,
set_suite_state,
utc_iso_timestamp,
)
from .vision_llm import VisionConfigError, resolve_vision_llm
logger = logging.getLogger("surfsense_evals")
console = Console(legacy_windows=False)
# ---------------------------------------------------------------------------
# Discovery
# ---------------------------------------------------------------------------
def _discover_suites() -> list[str]:
    """Run suite discovery so every benchmark's ``register(...)`` fires.

    The import is deferred to call time so commands that need no
    benchmark (such as ``models list``) do not pay for it at import.
    """
    from surfsense_evals.suites import discover_suites as _run_discovery

    return _run_discovery()
# ---------------------------------------------------------------------------
# Global LLM config fetcher (used by setup + models list)
# ---------------------------------------------------------------------------
@dataclass
class LlmConfigEntry:
    """One row of the global LLM config listing, normalised for filtering."""

    id: int
    name: str
    provider: str
    model_name: str
    # The unmodified payload item this entry was built from.
    raw: dict[str, Any]

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> LlmConfigEntry:
        """Build an entry from one API payload item.

        ``id`` is required and coerced to ``int``; ``name``, ``provider``
        and ``model_name`` default to ``""`` when absent. The provider is
        upper-cased so comparisons against it are case-insensitive.
        """
        config_id = int(payload["id"])
        display_name = str(payload.get("name", ""))
        provider = str(payload.get("provider", "")).upper()
        slug = str(payload.get("model_name", ""))
        return cls(config_id, display_name, provider, slug, payload)
async def _list_global_llm_configs(http: httpx.AsyncClient, base: str) -> list[LlmConfigEntry]:
response = await http.get(
f"{base}/api/v1/global-new-llm-configs",
headers={"Accept": "application/json"},
)
response.raise_for_status()
payload = response.json()
if not isinstance(payload, list):
raise RuntimeError(f"Unexpected /global-new-llm-configs payload: {payload!r}")
return [LlmConfigEntry.from_payload(item) for item in payload]
def _resolve_openrouter_id(
candidates: list[LlmConfigEntry],
provider_model: str,
*,
explicit_id: int | None,
) -> int:
"""Resolve the SurfSense LLM id for ``provider_model``.
Behaviour:
* If ``explicit_id`` is given: return it directly. The caller is
then expected to GET-validate that the row's
``provider == "OPENROUTER"`` and ``model_name`` matches the slug.
That branch supports positive BYOK ``NewLLMConfig`` rows whose
slugs may overlap with global OpenRouter virtuals.
* Otherwise: filter to ``provider == "OPENROUTER"`` and
``model_name == provider_model``. Expect exactly one match
raise with a friendly message otherwise.
"""
if explicit_id is not None:
return explicit_id
matches = [
c for c in candidates if c.provider == "OPENROUTER" and c.model_name == provider_model
]
if not matches:
sample = ", ".join(
f"{c.model_name} (id={c.id})" for c in candidates if c.provider == "OPENROUTER"
)[:600]
raise RuntimeError(
f"No OpenRouter config found for slug '{provider_model}'. "
"Make sure `openrouter_integration.enabled: true` in "
"global_llm_config.yaml and that the Celery worker has "
"finished its first refresh (the catalogue is fetched at "
"Celery startup per `app/celery_app.py`). "
f"Available OpenRouter slugs (sample): {sample or '<none>'}.\n"
"Browse with: python -m surfsense_evals models list --grep <substring>"
)
if len(matches) > 1:
listing = "\n".join(f" id={c.id} name={c.name!r}" for c in matches)
raise RuntimeError(
f"Multiple OpenRouter configs for slug '{provider_model}':\n{listing}\n"
"Pass --agent-llm-id <id> to disambiguate."
)
return matches[0].id
# ---------------------------------------------------------------------------
# Subcommand implementations
# ---------------------------------------------------------------------------
async def _cmd_setup(args: argparse.Namespace) -> int:
    """Create or reuse the suite's SearchSpace, pin its LLMs, and persist state.

    Steps, in order: validate the flags, acquire a token, resolve the
    agent LLM id, reuse the SearchSpace recorded in state (or create a
    fresh one), optionally resolve a vision LLM config, write the LLM
    preferences, read them back to verify the pin, then store a
    ``SuiteState`` for the suite.

    Returns:
        ``0`` on success; ``2`` on invalid flags, missing credentials, an
        unresolvable agent/vision LLM, or a failed pin validation.
    """
    suite = args.suite
    provider_model: str = args.provider_model
    explicit_id: int | None = args.agent_llm_id
    scenario: str = args.scenario
    vision_llm_slug: str | None = args.vision_llm
    native_arm_model: str | None = args.native_arm_model
    skip_vision_setup: bool = args.no_vision_llm_setup
    if explicit_id == 0:
        console.print(
            "[red]agent_llm_id == 0 (Auto / LiteLLM router) is not allowed — "
            "results would not be reproducible.[/red]"
        )
        return 2
    if scenario not in SCENARIOS:
        console.print(
            f"[red]Unknown scenario {scenario!r}. Pick one of: "
            f"{', '.join(SCENARIOS)}[/red]"
        )
        return 2
    # Scenario-specific validation. Each branch documents WHY the rule
    # exists so the operator's mental model matches what the runner does.
    if scenario == "cost-arbitrage":
        if not native_arm_model:
            console.print(
                "[red]--scenario cost-arbitrage requires --native-arm-model "
                "<vision-capable slug>.[/red] The native arm needs a vision "
                "model to fairly answer image-bearing questions; SurfSense "
                "answers from already-extracted text via --provider-model."
            )
            return 2
        # Identical slugs are only warned about; setup still proceeds.
        if native_arm_model == provider_model:
            console.print(
                "[yellow]--native-arm-model equals --provider-model in "
                "cost-arbitrage; that's degenerate (same as head-to-head). "
                "Pick a different slug or switch to --scenario head-to-head.[/yellow]"
            )
    elif scenario in ("head-to-head", "symmetric-cheap"):
        if native_arm_model:
            console.print(
                f"[yellow]--native-arm-model is ignored for --scenario {scenario} "
                f"(both arms answer with --provider-model={provider_model!r}).[/yellow]"
            )
            native_arm_model = None  # don't persist a stale value
    config = load_config()
    try:
        token = await acquire_token(config)
    except CredentialError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2
    async with client_with_auth(config, token) as http:
        candidates = await _list_global_llm_configs(http, config.surfsense_api_base)
        try:
            agent_llm_id = _resolve_openrouter_id(
                candidates, provider_model, explicit_id=explicit_id
            )
        except RuntimeError as exc:
            console.print(f"[red]{exc}[/red]")
            return 2
        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
        existing = get_suite_state(config, suite)
        if existing is not None:
            try:
                row = await ss_client.get(existing.search_space_id)
                console.print(
                    f"Reusing existing SearchSpace [cyan]{row.name}[/cyan] "
                    f"(id={row.id}) for suite [bold]{suite}[/bold]."
                )
                search_space_id = row.id
            except httpx.HTTPStatusError as exc:
                if exc.response.status_code == 404:
                    console.print(
                        f"[yellow]state.json pointed at SearchSpace id={existing.search_space_id} "
                        f"but backend returned 404; creating a fresh one.[/yellow]"
                    )
                    # Dropping the stale record routes into the creation
                    # branch below and resets ``ingestion_maps`` later on.
                    existing = None
                else:
                    raise
        if existing is None:
            ss_name = f"eval-{suite}-{utc_iso_timestamp()}"
            row = await ss_client.create(
                ss_name, description=f"surfsense-evals lifecycle ({suite})"
            )
            console.print(
                f"Created SearchSpace [cyan]{row.name}[/cyan] (id={row.id}) "
                f"for suite [bold]{suite}[/bold]."
            )
            search_space_id = row.id
        # Resolve + attach the vision LLM config (unless explicitly skipped).
        # Asymmetric scenarios make the vision LLM at ingest a hard
        # requirement — without it, SurfSense's chunks have no image
        # content and the entire framing collapses.
        vision_required = scenario in ("symmetric-cheap", "cost-arbitrage")
        vision_config_id: int | None = None
        vision_provider_model: str | None = None
        if not skip_vision_setup and (vision_required or vision_llm_slug is not None):
            try:
                vision_candidates = await ss_client.list_global_vision_llm_configs()
                resolved = resolve_vision_llm(
                    vision_candidates, explicit_slug=vision_llm_slug
                )
            except VisionConfigError as exc:
                console.print(f"[red]{exc}[/red]")
                return 2
            vision_config_id = resolved.config_id
            vision_provider_model = resolved.provider_model
            console.print(
                f"Vision LLM at ingest: [cyan]{vision_provider_model}[/cyan] "
                f"(id={vision_config_id}, selected_via={resolved.selected_via})."
            )
        # The vision key is only sent when a vision config was resolved above.
        pref_kwargs: dict[str, Any] = {"agent_llm_id": agent_llm_id}
        if vision_config_id is not None:
            pref_kwargs["vision_llm_config_id"] = vision_config_id
        await ss_client.set_llm_preferences(search_space_id, **pref_kwargs)
        # Read the preferences back so the pin is verified against what the
        # backend actually stored, not against what was sent.
        prefs = await ss_client.get_llm_preferences(search_space_id)
        if not _validate_pin(prefs, provider_model):
            agent = prefs.agent_llm or {}
            console.print(
                f"[red]LLM pin validation FAILED.[/red] After PUT, "
                f"agent_llm.provider={agent.get('provider')!r}, "
                f"model_name={agent.get('model_name')!r}; expected "
                f"provider=OPENROUTER, model_name={provider_model!r}."
            )
            return 2
        if vision_config_id is not None and prefs.vision_llm_config_id != vision_config_id:
            console.print(
                f"[red]Vision LLM pin validation FAILED.[/red] After PUT, "
                f"vision_llm_config_id={prefs.vision_llm_config_id!r}; "
                f"expected {vision_config_id!r}."
            )
            return 2
    suite_state = SuiteState(
        search_space_id=search_space_id,
        agent_llm_id=agent_llm_id,
        provider_model=provider_model,
        created_at=utc_iso_timestamp(),
        # Ingestion maps carry over only when the SearchSpace was reused.
        ingestion_maps=existing.ingestion_maps if existing else {},
        scenario=scenario,
        vision_llm_config_id=vision_config_id,
        vision_provider_model=vision_provider_model,
        native_arm_model=native_arm_model,
    )
    set_suite_state(config, suite, suite_state)
    summary_bits = [
        f"suite={suite!r}",
        f"scenario={scenario!r}",
        f"search_space_id={suite_state.search_space_id}",
        f"agent_llm_id={suite_state.agent_llm_id}",
        f"provider_model={suite_state.provider_model!r}",
    ]
    if suite_state.vision_provider_model:
        summary_bits.append(f"vision_provider_model={suite_state.vision_provider_model!r}")
    if suite_state.native_arm_model:
        summary_bits.append(f"native_arm_model={suite_state.native_arm_model!r}")
    console.print(f"[green]setup OK[/green] {' '.join(summary_bits)}")
    return 0
def _validate_pin(prefs: LlmPreferences, provider_model: str) -> bool:
agent = prefs.agent_llm or {}
return (
str(agent.get("provider", "")).upper() == "OPENROUTER"
and str(agent.get("model_name", "")) == provider_model
)
async def _cmd_teardown(args: argparse.Namespace) -> int:
    """Delete the suite's SearchSpace and clear its slot in the state file.

    A failed DELETE is reported as a warning and the state slot is
    cleared anyway; the final message states whether the backend delete
    actually succeeded instead of always claiming it did.

    Returns:
        ``0`` when there was nothing to tear down or the slot was
        cleared; ``2`` when no credentials could be acquired.
    """
    suite = args.suite
    config = load_config()
    state = get_suite_state(config, suite)
    if state is None:
        console.print(f"[yellow]No state for suite {suite!r}; nothing to tear down.[/yellow]")
        return 0
    try:
        token = await acquire_token(config)
    except CredentialError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2
    deleted = True
    async with client_with_auth(config, token) as http:
        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
        try:
            await ss_client.delete(state.search_space_id)
        except httpx.HTTPStatusError as exc:
            deleted = False
            console.print(
                f"[yellow]DELETE failed (HTTP {exc.response.status_code}); "
                "clearing state.json anyway.[/yellow]"
            )
    clear_suite_state(config, suite)
    outcome = "SearchSpace soft-deleted" if deleted else "SearchSpace DELETE failed"
    console.print(
        f"[green]teardown OK[/green] suite={suite!r} "
        f"({outcome}, state.json slot cleared)."
    )
    return 0
async def _cmd_models_list(args: argparse.Namespace) -> int:
    """Print a table of global LLM configs, optionally filtered.

    ``--provider`` is matched case-insensitively against the entry's
    provider; ``--grep`` is a case-insensitive substring match on
    ``model_name`` or ``name``. Returns ``0``, or ``2`` when no
    credentials could be acquired.
    """
    config = load_config()
    try:
        token = await acquire_token(config)
    except CredentialError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2
    async with client_with_auth(config, token) as http:
        entries = await _list_global_llm_configs(http, config.surfsense_api_base)

    needle = (args.grep or "").lower()
    wanted_provider = (args.provider or "").upper()
    selected: list[LlmConfigEntry] = [
        entry
        for entry in entries
        if (not wanted_provider or entry.provider == wanted_provider)
        and (
            not needle
            or needle in entry.model_name.lower()
            or needle in entry.name.lower()
        )
    ]

    table = Table(
        title=f"Global LLM configs ({len(selected)} of {len(entries)})",
        show_lines=False,
    )
    table.add_column("id", justify="right", style="cyan")
    table.add_column("provider", style="magenta")
    table.add_column("model_name", style="green")
    table.add_column("name")
    ordered = sorted(selected, key=lambda item: (item.provider, item.model_name))
    for entry in ordered:
        table.add_row(str(entry.id), entry.provider, entry.model_name, entry.name)
    console.print(table)
    return 0
def _cmd_suites_list(_args: argparse.Namespace) -> int:
    """Print every registered suite with the names of its benchmarks.

    Always returns ``0``; an empty registry prints a hint instead of a
    table.
    """
    _discover_suites()
    suites = registry.list_suites()
    if not suites:
        console.print(
            "[yellow]No suites registered. Drop a benchmark under "
            "src/surfsense_evals/suites/<domain>/<benchmark>/.[/yellow]"
        )
        return 0

    table = Table(title=f"Registered suites ({len(suites)})")
    for header, style in (("suite", "bold"), ("benchmarks", "green")):
        table.add_column(header, style=style)
    for suite in suites:
        joined = ", ".join(b.name for b in registry.list_benchmarks(suite))
        table.add_row(suite, joined if joined else "<none>")
    console.print(table)
    return 0
def _cmd_benchmarks_list(args: argparse.Namespace) -> int:
    """Print the registered benchmarks, optionally limited to ``--suite``.

    Always returns ``0``; when nothing is registered a notice is printed
    instead of a table.
    """
    _discover_suites()
    benchmarks = registry.list_benchmarks(args.suite)
    if not benchmarks:
        console.print("[yellow]No benchmarks registered.[/yellow]")
        return 0

    table = Table(title=f"Benchmarks ({len(benchmarks)})")
    table.add_column("suite", style="bold")
    table.add_column("name", style="cyan")
    table.add_column("headline", justify="center")
    table.add_column("description")
    for bench in benchmarks:
        headline_flag = "yes" if bench.headline else "no"
        # ``description`` is optional on a benchmark.
        description = getattr(bench, "description", "")
        table.add_row(bench.suite, bench.name, headline_flag, description)
    console.print(table)
    return 0
async def _cmd_ingest(args: argparse.Namespace) -> int:
    """Run the selected benchmark's ``ingest`` step against the suite's state.

    Returns ``0`` on success and ``2`` when the suite has not been set
    up or no credentials could be acquired.
    """
    suite_name = args.suite
    bench_name = args.benchmark
    benchmark = registry.get(suite_name, bench_name)
    config = load_config()
    state = get_suite_state(config, suite_name)
    if state is None:
        console.print(
            f"[red]No setup for suite {suite_name!r}. Run "
            f"`python -m surfsense_evals setup --suite {suite_name} "
            f"--provider-model <slug>` first.[/red]"
        )
        return 2
    try:
        token = await acquire_token(config)
    except CredentialError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2

    # Everything argparse parsed, minus the CLI's own routing fields, is
    # forwarded so a benchmark can honour its own flags (e.g. MIRAGE's
    # --skip-snippet-filter / --corpus).
    routing_fields = {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
    benchmark_flags = {
        key: value for key, value in vars(args).items() if key not in routing_fields
    }
    async with client_with_auth(config, token) as http:
        ctx = registry.RunContext(
            suite=suite_name,
            benchmark=bench_name,
            config=config,
            suite_state=state,
            http=http,
        )
        await benchmark.ingest(ctx, **benchmark_flags)
    console.print(f"[green]ingest OK[/green] {suite_name}/{bench_name}")
    return 0
async def _cmd_run(args: argparse.Namespace) -> int:
    """Run the selected benchmark and report where its raw output was written.

    Returns:
        ``0`` on success; ``2`` when the suite has not been set up or no
        credentials could be acquired.
    """
    benchmark = registry.get(args.suite, args.benchmark)
    config = load_config()
    state = get_suite_state(config, args.suite)
    if state is None:
        console.print(
            f"[red]No setup for suite {args.suite!r}. Run "
            f"`python -m surfsense_evals setup --suite {args.suite} "
            f"--provider-model <slug>` first.[/red]"
        )
        return 2
    try:
        token = await acquire_token(config)
    except CredentialError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2
    # Forward the benchmark's own flags; the CLI's routing fields are
    # dropped so they never reach ``run``.
    extra_kwargs = {
        k: v
        for k, v in vars(args).items()
        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
    }
    async with client_with_auth(config, token) as http:
        ctx = registry.RunContext(
            suite=args.suite,
            benchmark=args.benchmark,
            config=config,
            suite_state=state,
            http=http,
        )
        artifact = await benchmark.run(ctx, **extra_kwargs)
    # Separate the benchmark name from the path (same arrow as the report
    # command); previously the two were glued together.
    console.print(
        f"[green]run OK[/green] {args.suite}/{args.benchmark} "
        f"→ {artifact.raw_path}"
    )
    return 0
async def _cmd_report(args: argparse.Namespace) -> int:
    """Aggregate the latest run artifacts of a suite into a summary report.

    Returns ``0`` when the report was written, ``1`` when no run
    artifacts exist yet, and ``2`` when the suite has no setup or the
    ``--benchmark`` filter matches nothing.
    """
    from .report import write_report

    wanted = args.benchmark
    config = load_config()
    if get_suite_state(config, args.suite) is None:
        console.print(f"[red]No setup for suite {args.suite!r}.[/red]")
        return 2

    benchmarks = registry.list_benchmarks(args.suite)
    if wanted:
        benchmarks = [b for b in benchmarks if b.name == wanted]
        if not benchmarks:
            console.print(
                f"[red]No registered benchmark named {wanted!r} in suite {args.suite!r}.[/red]"
            )
            return 2

    artifacts = _collect_artifacts(config, args.suite, [b.name for b in benchmarks])
    if not artifacts:
        console.print(
            "[yellow]No run artifacts found under "
            f"{config.suite_runs_dir(args.suite)}. Run a benchmark first.[/yellow]"
        )
        return 1

    by_benchmark: dict[str, list[registry.RunArtifact]] = {}
    for art in artifacts:
        by_benchmark.setdefault(art.benchmark, []).append(art)

    # Sections follow registry order; benchmarks without artifacts are skipped.
    sections: list[registry.ReportSection] = [
        bench.report_section(by_benchmark[bench.name])
        for bench in benchmarks
        if bench.name in by_benchmark
    ]
    summary_path = write_report(
        config=config,
        suite=args.suite,
        sections=sections,
        run_timestamp=utc_iso_timestamp(),
    )
    console.print(f"[green]report OK[/green] → {summary_path}")
    return 0
def _collect_artifacts(
config: Config, suite: str, benchmark_names: list[str]
) -> list[registry.RunArtifact]:
"""Walk ``data/<suite>/runs/*/<benchmark>/`` for the latest artifacts.
Reads any ``run_artifact.json`` written by a benchmark runner. The
runner is responsible for writing this manifest alongside its raw
JSONL so the report writer doesn't have to know benchmark-specific
metric shapes.
"""
runs_dir = config.suite_runs_dir(suite)
if not runs_dir.exists():
return []
artifacts: list[registry.RunArtifact] = []
by_bench: dict[str, registry.RunArtifact] = {}
for ts_dir in sorted(runs_dir.iterdir()):
if not ts_dir.is_dir():
continue
for bench_name in benchmark_names:
bench_dir = ts_dir / bench_name
manifest = bench_dir / "run_artifact.json"
if not manifest.exists():
continue
try:
with manifest.open("r", encoding="utf-8") as fh:
payload = json.load(fh)
except (OSError, json.JSONDecodeError):
continue
artifact = registry.RunArtifact(
suite=suite,
benchmark=bench_name,
run_timestamp=ts_dir.name,
raw_path=bench_dir / payload.get("raw_path", "raw.jsonl"),
metrics=payload.get("metrics", {}),
extra=payload.get("extra", {}),
)
# Latest run wins per benchmark.
by_bench[bench_name] = artifact
artifacts = list(by_bench.values())
return artifacts
# ---------------------------------------------------------------------------
# Argparse wiring
# ---------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``surfsense-evals`` argument parser.

    Static subcommands (``setup``, ``teardown``, ``models``, ``suites``,
    ``benchmarks``, ``report``) are declared inline. ``ingest`` and
    ``run`` are generated from the registry, one sub-parser per
    suite/benchmark. Every leaf parser stores its handler in ``_func``
    and whether that handler is a coroutine in ``_async``.
    """
    parser = argparse.ArgumentParser(
        prog="surfsense-evals",
        description="SurfSense evaluation harness — domain-agnostic core + pluggable suites.",
    )
    parser.add_argument(
        "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"]
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_setup = sub.add_parser("setup", help="Create per-suite SearchSpace + pin LLM.")
    p_setup.add_argument("--suite", required=True)
    p_setup.add_argument(
        "--provider-model",
        required=True,
        help=(
            "OpenRouter slug for the SurfSense answer LLM (and the native arm "
            "too unless --native-arm-model is set), e.g. "
            "'anthropic/claude-sonnet-4.5'."
        ),
    )
    p_setup.add_argument(
        "--agent-llm-id",
        type=int,
        default=None,
        help="Optional override for BYOK NewLLMConfig rows.",
    )
    p_setup.add_argument(
        "--scenario",
        choices=SCENARIOS,
        default=DEFAULT_SCENARIO,
        help=(
            "head-to-head (default): both arms answer with --provider-model; "
            "symmetric-cheap: both arms use the same cheap text-only slug, "
            "SurfSense pre-extracted images at ingest with a vision LLM; "
            "cost-arbitrage: native arm uses --native-arm-model (vision), "
            "SurfSense uses --provider-model (cheap, text-only) over chunks "
            "the vision LLM already extracted at ingest."
        ),
    )
    p_setup.add_argument(
        "--vision-llm",
        default=None,
        metavar="SLUG",
        help=(
            "OpenRouter slug for the vision LLM SurfSense uses at ingest "
            "when --use-vision-llm is on. If omitted in symmetric-cheap / "
            "cost-arbitrage, the strongest registered vision config is "
            "auto-picked (priority: claude-sonnet-4.5 > claude-opus-4.7 > "
            "gpt-5 > gemini-2.5-pro)."
        ),
    )
    p_setup.add_argument(
        "--native-arm-model",
        default=None,
        metavar="SLUG",
        help=(
            "Required for --scenario cost-arbitrage. OpenRouter slug used "
            "by the native_pdf arm only; SurfSense answers with "
            "--provider-model. Ignored for head-to-head / symmetric-cheap."
        ),
    )
    p_setup.add_argument(
        "--no-vision-llm-setup",
        action="store_true",
        help=(
            "Skip attaching a vision LLM config to the SearchSpace even if "
            "the scenario would normally require one. Use when you want to "
            "keep whatever is already attached (e.g. a per-user config)."
        ),
    )
    p_setup.set_defaults(_func=_cmd_setup, _async=True)

    p_teardown = sub.add_parser("teardown", help="Soft-delete the suite SearchSpace + clear state slot.")
    p_teardown.add_argument("--suite", required=True)
    p_teardown.set_defaults(_func=_cmd_teardown, _async=True)

    p_models = sub.add_parser("models", help="LLM-config discovery helpers.")
    models_sub = p_models.add_subparsers(dest="subcommand", required=True)
    p_models_list = models_sub.add_parser("list", help="List global LLM configs.")
    p_models_list.add_argument("--provider", default=None, help="Filter by provider, e.g. openrouter")
    p_models_list.add_argument("--grep", default=None, help="Substring filter on name / model_name.")
    p_models_list.set_defaults(_func=_cmd_models_list, _async=True)

    p_suites = sub.add_parser("suites", help="List registered suites.")
    suites_sub = p_suites.add_subparsers(dest="subcommand", required=True)
    p_suites_list = suites_sub.add_parser("list", help="List suites.")
    p_suites_list.set_defaults(_func=_cmd_suites_list, _async=False)

    p_benchmarks = sub.add_parser("benchmarks", help="List registered benchmarks.")
    bench_sub = p_benchmarks.add_subparsers(dest="subcommand", required=True)
    p_bench_list = bench_sub.add_parser("list", help="List benchmarks.")
    p_bench_list.add_argument("--suite", default=None)
    p_bench_list.set_defaults(_func=_cmd_benchmarks_list, _async=False)

    # Dynamic ingest / run subcommands need the registry populated, so
    # discover up-front (cheap on import — modules just register).
    _discover_suites()
    _add_benchmark_command(
        sub,
        command="ingest",
        command_help="Ingest a benchmark's corpus.",
        verb="Ingest",
        handler=_cmd_ingest,
    )
    _add_benchmark_command(
        sub,
        command="run",
        command_help="Run a benchmark.",
        verb="Run",
        handler=_cmd_run,
    )

    p_report = sub.add_parser("report", help="Aggregate latest run artifacts into a summary.")
    p_report.add_argument("--suite", required=True)
    p_report.add_argument("--benchmark", default=None, help="Optional: report only this benchmark.")
    p_report.set_defaults(_func=_cmd_report, _async=True)
    return parser


def _add_benchmark_command(
    sub: Any,
    *,
    command: str,
    command_help: str,
    verb: str,
    handler: Any,
) -> None:
    """Add ``<command> <suite> <benchmark>`` sub-parsers built from the registry.

    Each benchmark leaf gets the benchmark's own flags (when it defines
    ``add_run_args``) and is routed to the coroutine ``handler``.
    """
    command_parser = sub.add_parser(command, help=command_help)
    suite_sub = command_parser.add_subparsers(dest="suite", required=True)
    for suite in registry.list_suites():
        suite_parser = suite_sub.add_parser(suite, help=f"{verb} a {suite} benchmark.")
        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
        for benchmark in registry.list_benchmarks(suite):
            bp = suite_bench.add_parser(
                benchmark.name, help=getattr(benchmark, "description", benchmark.name)
            )
            if hasattr(benchmark, "add_run_args"):
                benchmark.add_run_args(bp)
            bp.set_defaults(_func=handler, _async=True)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the subcommand.

    Args:
        argv: Argument vector handed to ``parse_args``; ``None`` lets
            ``argparse`` fall back to its own default.

    Returns:
        The handler's own return value on success; ``2`` when the parsed
        namespace carries no ``_func`` (help is printed instead); ``130``
        after a ``KeyboardInterrupt``; ``1`` after any other exception.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    handler = getattr(args, "_func", None)
    if handler is None:
        parser.print_help()
        return 2

    # Subparsers tag coroutine handlers with ``_async=True``.
    run_in_event_loop = getattr(args, "_async", False)
    try:
        if not run_in_event_loop:
            return handler(args)
        return asyncio.run(handler(args))
    except KeyboardInterrupt:
        console.print("[yellow]Interrupted.[/yellow]")
        return 130
    except Exception as exc:  # noqa: BLE001
        # Full traceback goes to the log; the console gets a one-liner.
        logger.exception("CLI command failed")
        console.print(f"[red]Command failed: {exc}[/red]")
        return 1
if __name__ == "__main__": # pragma: no cover
sys.exit(main())

View file

@ -0,0 +1,14 @@
"""HTTP clients for the SurfSense API. All share one ``httpx.AsyncClient``."""
from __future__ import annotations
from .documents import DocumentsClient
from .new_chat import NewChatClient, StreamedAnswer
from .search_space import SearchSpaceClient
__all__ = [
"DocumentsClient",
"NewChatClient",
"SearchSpaceClient",
"StreamedAnswer",
]

View file

@ -0,0 +1,277 @@
"""Client for ``/api/v1/documents/{fileupload,status,{id}/chunks}``.
Verified against:
* ``surfsense_backend/app/routes/documents_routes.py:122-292`` (POST fileupload)
* ``surfsense_backend/app/routes/documents_routes.py:806-871`` (GET status batch)
* ``surfsense_backend/app/routes/documents_routes.py:1062-1128`` (GET {id}/chunks paginated)
Document processing is asynchronous:
* ``POST /documents/fileupload`` returns immediately with
``document_ids`` in ``pending``;
* a Celery worker moves each through ``processing`` → ``ready`` / ``failed``;
* the harness polls ``GET /documents/status?document_ids=...`` until
every doc is ``ready`` (otherwise the retriever sees an empty corpus
and accuracy numbers are meaningless).
"""
from __future__ import annotations
import asyncio
import logging
import mimetypes
from collections.abc import Iterable, Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import httpx
logger = logging.getLogger(__name__)
@dataclass
class FileUploadResult:
    """Mirrors the JSON returned by ``POST /documents/fileupload``."""

    document_ids: list[int]
    duplicate_document_ids: list[int]
    total_files: int
    pending_files: int
    skipped_duplicates: int
    message: str = ""

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> FileUploadResult:
        """Build a result from the decoded response body.

        Missing keys *and* explicit JSON ``null`` values both fall back to
        the empty default (``[]``, ``0`` or ``""``) instead of raising, so
        a sparse response still yields a usable object.

        Args:
            payload: Decoded JSON object from the upload endpoint.

        Returns:
            A ``FileUploadResult`` with ids and counters coerced to ``int``.
        """

        def _id_list(key: str) -> list[int]:
            # ``or []`` covers both an absent key and an explicit null.
            return [int(raw) for raw in payload.get(key) or []]

        def _counter(key: str) -> int:
            return int(payload.get(key) or 0)

        return cls(
            document_ids=_id_list("document_ids"),
            duplicate_document_ids=_id_list("duplicate_document_ids"),
            total_files=_counter("total_files"),
            pending_files=_counter("pending_files"),
            skipped_duplicates=_counter("skipped_duplicates"),
            message=str(payload.get("message") or ""),
        )
@dataclass
class DocumentStatus:
    """One document's processing status as reported by the status endpoint."""

    document_id: int
    title: str
    document_type: str
    # Compared against the literals "ready" / "failed" by the helpers below.
    state: str
    # Optional explanation attached to the state.
    reason: str | None = None

    @property
    def is_ready(self) -> bool:
        """``True`` exactly when ``state`` is the string ``"ready"``."""
        return "ready" == self.state

    @property
    def is_failed(self) -> bool:
        """``True`` exactly when ``state`` is the string ``"failed"``."""
        return "failed" == self.state
@dataclass
class ChunkRow:
    """A single chunk belonging to a document."""

    id: int
    document_id: int
    # Chunk text; empty when not supplied.
    content: str = ""
    # Untouched source mapping for the chunk; a fresh dict per instance.
    raw: dict[str, Any] = field(default_factory=dict)
class DocumentProcessingFailed(RuntimeError):
    """Error carrying the statuses of documents that ended up ``failed``.

    The exception message lists every offending document (id, title and
    reason); the statuses themselves stay available on ``.statuses``.
    """

    def __init__(self, statuses: Sequence[DocumentStatus]) -> None:
        fragments = []
        for status in statuses:
            # A missing / empty reason is rendered as "unknown".
            fragments.append(
                f"id={status.document_id} ({status.title!r}): {status.reason or 'unknown'}"
            )
        details = ", ".join(fragments)
        super().__init__(f"Document(s) failed to process: {details}")
        self.statuses = list(statuses)
class DocumentProcessingTimeout(RuntimeError):
    """Signals that status polling ran out of its time budget.

    Raised by ``DocumentsClient.wait_until_ready`` once the deadline has
    passed while documents are still not ``ready``.
    """
class DocumentsClient:
    """Document upload + status polling + chunk listing.

    Every request is issued through the ``httpx.AsyncClient`` passed to
    the constructor; this class does not create or close a client.
    """

    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
        self._http = http
        # Trailing slashes are dropped so URL f-strings never contain ``//``.
        self._base = base_url.rstrip("/")

    # ------------------------------------------------------------------
    # upload
    # ------------------------------------------------------------------

    async def upload(
        self,
        files: Iterable[Path],
        *,
        search_space_id: int,
        should_summarize: bool = False,
        use_vision_llm: bool = False,
        processing_mode: str = "basic",
    ) -> FileUploadResult:
        """Upload files to ``/api/v1/documents/fileupload``.

        ``files`` is materialised to a list first so an empty iterable can
        be short-circuited without a request. The caller is responsible
        for ensuring each path exists and respects the backend's per-file
        size cap.

        Args:
            files: Paths of the files to send as multipart ``files`` parts.
            search_space_id: Target search space, sent as a form field.
            should_summarize: Sent as the string ``"true"`` / ``"false"``.
            use_vision_llm: Sent as the string ``"true"`` / ``"false"``.
            processing_mode: Forwarded verbatim as a form field.

        Returns:
            The parsed upload response, or an all-empty result with
            message ``"No files supplied"`` when ``files`` is empty.

        Raises:
            httpx.HTTPStatusError: On a non-success response status.
            OSError: If one of the paths cannot be opened.
        """
        materialised = [Path(p) for p in files]
        if not materialised:
            return FileUploadResult(
                document_ids=[],
                duplicate_document_ids=[],
                total_files=0,
                pending_files=0,
                skipped_duplicates=0,
                message="No files supplied",
            )

        opened: list[tuple[str, Any]] = []
        try:
            for path in materialised:
                # Raw file handle is passed straight to httpx's multipart encoder.
                file_obj = path.open("rb")
                mime, _ = mimetypes.guess_type(path.name)
                opened.append(
                    (
                        "files",
                        (path.name, file_obj, mime or "application/octet-stream"),
                    )
                )
            response = await self._http.post(
                f"{self._base}/api/v1/documents/fileupload",
                data={
                    "search_space_id": str(search_space_id),
                    "should_summarize": "true" if should_summarize else "false",
                    "use_vision_llm": "true" if use_vision_llm else "false",
                    "processing_mode": processing_mode,
                },
                files=opened,
                # Multipart uploads can be slow for big PDFs; bump per-call.
                timeout=httpx.Timeout(120.0, connect=10.0),
            )
        finally:
            # Best-effort cleanup: a failing close must not mask the
            # upload's own outcome (success or exception).
            for _, (_, file_obj, _) in opened:
                try:
                    file_obj.close()
                except Exception:  # noqa: BLE001
                    pass
        response.raise_for_status()
        return FileUploadResult.from_payload(response.json())

    # ------------------------------------------------------------------
    # status polling
    # ------------------------------------------------------------------

    async def get_status(
        self, *, search_space_id: int, document_ids: Sequence[int]
    ) -> list[DocumentStatus]:
        """Fetch the current status of ``document_ids`` in one request.

        Args:
            search_space_id: Sent as the ``search_space_id`` query parameter.
            document_ids: Joined with commas into the ``document_ids``
                query parameter.

        Returns:
            One ``DocumentStatus`` per entry of the response's ``items``
            list; ``[]`` without a request when ``document_ids`` is empty.
            An item without a ``status`` object is reported as ``ready``.

        Raises:
            httpx.HTTPStatusError: On a non-success response status.
        """
        if not document_ids:
            return []
        response = await self._http.get(
            f"{self._base}/api/v1/documents/status",
            params={
                "search_space_id": search_space_id,
                "document_ids": ",".join(str(d) for d in document_ids),
            },
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()
        return [
            DocumentStatus(
                document_id=int(item["id"]),
                title=str(item.get("title", "")),
                document_type=str(item.get("document_type", "")),
                state=str((item.get("status") or {}).get("state", "ready")),
                reason=(item.get("status") or {}).get("reason"),
            )
            for item in payload.get("items", [])
        ]

    async def wait_until_ready(
        self,
        *,
        search_space_id: int,
        document_ids: Sequence[int],
        timeout_s: float = 300.0,
        initial_poll_s: float = 1.0,
        max_poll_s: float = 10.0,
    ) -> list[DocumentStatus]:
        """Poll ``GET /documents/status`` until every doc is ``ready``.

        The poll interval starts at ``initial_poll_s`` and grows by 1.5x
        per round up to ``max_poll_s``. Duplicate ids in ``document_ids``
        are collapsed before polling, so a repeated id cannot prevent
        completion.

        Args:
            search_space_id: Search space the documents belong to.
            document_ids: Ids to wait for.
            timeout_s: Overall time budget in seconds.
            initial_poll_s: First sleep between polls.
            max_poll_s: Upper bound for the sleep between polls.

        Returns:
            The statuses from the poll in which every requested id was
            ``ready``; ``[]`` when ``document_ids`` is empty.

        Raises:
            DocumentProcessingFailed: As soon as any document is ``failed``.
            DocumentProcessingTimeout: When the budget is exhausted; the
                message lists every requested id that is not ``ready``
                yet, including ids the backend did not report at all.
        """
        # Order-preserving de-duplication; ids are normalised to ``int``
        # because that is what ``get_status`` yields for comparison.
        wanted_ids = list(dict.fromkeys(int(d) for d in document_ids))
        if not wanted_ids:
            return []

        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout_s
        poll = initial_poll_s
        while True:
            statuses = await self.get_status(
                search_space_id=search_space_id, document_ids=wanted_ids
            )
            failed = [s for s in statuses if s.is_failed]
            if failed:
                raise DocumentProcessingFailed(failed)

            ready_ids = {s.document_id for s in statuses if s.is_ready}
            outstanding = [d for d in wanted_ids if d not in ready_ids]
            if not outstanding:
                return statuses

            now = loop.time()
            if now >= deadline:
                raise DocumentProcessingTimeout(
                    f"Timed out after {timeout_s:.0f}s waiting for documents "
                    f"(still pending/processing: {outstanding})"
                )
            # Never sleep past the deadline, but always yield for >= 0.1s.
            await asyncio.sleep(min(poll, max(0.1, deadline - now)))
            poll = min(poll * 1.5, max_poll_s)

    # ------------------------------------------------------------------
    # chunks (chunk_id -> document_id map)
    # ------------------------------------------------------------------

    async def list_chunks(
        self, document_id: int, *, page_size: int = 100
    ) -> list[ChunkRow]:
        """Walk ``GET /documents/{id}/chunks`` until ``has_more`` is falsy.

        Args:
            document_id: Document whose chunks are listed; also stamped
                onto every returned row.
            page_size: Sent as the ``page_size`` query parameter.

        Returns:
            All chunk rows across pages, in response order. Paging starts
            at ``page=0``.

        Raises:
            httpx.HTTPStatusError: On a non-success response status.
        """
        rows: list[ChunkRow] = []
        page = 0
        while True:
            response = await self._http.get(
                f"{self._base}/api/v1/documents/{document_id}/chunks",
                params={"page": page, "page_size": page_size},
                headers={"Accept": "application/json"},
            )
            response.raise_for_status()
            payload = response.json()
            rows.extend(
                ChunkRow(
                    id=int(item["id"]),
                    document_id=document_id,
                    content=str(item.get("content", "")),
                    raw=item,
                )
                for item in payload.get("items", [])
            )
            if not payload.get("has_more"):
                break
            page += 1
        return rows

View file

@ -0,0 +1,280 @@
"""Client for ``/api/v1/threads`` and ``/api/v1/new_chat`` (SSE).
Verified against:
* ``surfsense_backend/app/routes/new_chat_routes.py:793-848`` (POST /threads)
* ``surfsense_backend/app/routes/new_chat_routes.py:1073-1142`` (DELETE /threads/{id})
* ``surfsense_backend/app/routes/new_chat_routes.py:1689-1800`` (POST /new_chat SSE)
* ``surfsense_backend/app/routes/new_chat_routes.py:191-220`` (THREAD_BUSY / TURN_CANCELLING 409)
* ``surfsense_backend/app/services/streaming/envelope/sse.py`` (wire framing)
* ``surfsense_backend/app/services/streaming/events/text.py`` (text-delta events)
* ``surfsense_backend/app/schemas/new_chat.py:234-288`` (NewChatRequest body)
The wire format is "Vercel AI SDK"-flavoured SSE with one event per
``data: <json>\n\n`` block (or the literal ``data: [DONE]\n\n``
terminator). Text deltas arrive as ``{"type":"text-delta","id":...,"delta":...}``
events; we accumulate them per ``id`` and emit the final concatenated
text plus parsed citations.
"""
from __future__ import annotations
import asyncio
import json
import logging
import time
from collections.abc import AsyncIterator, Sequence
from dataclasses import dataclass, field
from typing import Any
import httpx
from ..parse import iter_sse_events, parse_citations
logger = logging.getLogger(__name__)
@dataclass
class StreamedAnswer:
    """Outcome of one ``/new_chat`` turn after the stream was consumed."""

    # Concatenated assistant text.
    text: str
    # Every JSON-object event seen on the stream, in arrival order.
    raw_events: list[dict[str, Any]] = field(default_factory=list)
    latency_ms: int = 0
    user_message_id: str | None = None
    assistant_message_id: str | None = None
    # Set when the stream delivered a terminator rather than just ending.
    finished_normally: bool = False

    @property
    def citations(self) -> list[dict[str, Any]]:
        """Citation tokens parsed out of ``text``, recomputed on each access."""
        tokens = parse_citations(self.text)
        return [tok.to_dict() for tok in tokens]
class ThreadBusyError(RuntimeError):
    """A 409 ``THREAD_BUSY`` / ``TURN_CANCELLING`` answer from ``/new_chat``.

    The message is rendered as ``"<error_code>: <message>"`` and the code
    is additionally kept on ``.error_code`` for programmatic checks.
    """

    def __init__(self, error_code: str, message: str) -> None:
        rendered = f"{error_code}: {message}"
        super().__init__(rendered)
        self.error_code = error_code
class NewChatClient:
    """Thread create / delete / SSE ask."""

    # Upper bound (seconds) for a single sleep between busy retries.
    _MAX_BUSY_WAIT_S = 30.0

    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
        self._http = http
        # Trailing slashes are dropped so URL f-strings never contain ``//``.
        self._base = base_url.rstrip("/")

    # ------------------------------------------------------------------
    # threads
    # ------------------------------------------------------------------

    async def create_thread(
        self,
        *,
        search_space_id: int,
        title: str = "eval",
        archived: bool = False,
        visibility: str = "PRIVATE",
    ) -> int:
        """Create a thread via ``POST /api/v1/threads``.

        Returns:
            The ``id`` field of the response, coerced to ``int``.

        Raises:
            httpx.HTTPStatusError: On a non-success response status.
        """
        response = await self._http.post(
            f"{self._base}/api/v1/threads",
            json={
                "search_space_id": search_space_id,
                "title": title,
                "archived": archived,
                "visibility": visibility,
            },
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()
        return int(payload["id"])

    async def delete_thread(self, thread_id: int) -> None:
        """Delete a thread; a 404 is treated as success (idempotent).

        Raises:
            httpx.HTTPStatusError: On any other non-success status.
        """
        response = await self._http.delete(
            f"{self._base}/api/v1/threads/{thread_id}",
            headers={"Accept": "application/json"},
        )
        if response.status_code == 404:
            return  # idempotent
        response.raise_for_status()

    # ------------------------------------------------------------------
    # /new_chat SSE
    # ------------------------------------------------------------------

    async def ask(
        self,
        *,
        thread_id: int,
        search_space_id: int,
        user_query: str,
        mentioned_document_ids: Sequence[int] | None = None,
        disabled_tools: Sequence[str] | None = None,
        max_busy_retries: int = 4,
        timeout_s: float = 600.0,
    ) -> StreamedAnswer:
        """Stream a single turn and return the accumulated answer.

        A 409 busy response is retried up to ``max_busy_retries`` times so
        a stuck thread never blocks the whole run. The sleep before each
        retry is the server's hint (``retry-after-ms``, else a numeric
        ``Retry-After``) when the response carried one, otherwise an
        exponential backoff of ``0.5 * 2**attempt`` seconds; either way
        it is capped at 30 seconds.

        Args:
            thread_id: Sent as ``chat_id`` in the request body.
            search_space_id: Search space to answer against.
            user_query: The user's message.
            mentioned_document_ids: Included in the body only when non-empty.
            disabled_tools: Included in the body only when non-empty.
            max_busy_retries: Number of retries after busy responses.
            timeout_s: Per-attempt timeout (connect is fixed at 10s).

        Raises:
            ThreadBusyError: When the thread is still busy after the
                final retry.
            httpx.HTTPStatusError: On any other non-success status.
        """
        body: dict[str, Any] = {
            "chat_id": thread_id,
            "search_space_id": search_space_id,
            "user_query": user_query,
        }
        if mentioned_document_ids:
            body["mentioned_document_ids"] = list(mentioned_document_ids)
        if disabled_tools:
            body["disabled_tools"] = list(disabled_tools)

        attempt = 0
        while True:
            try:
                return await self._stream_once(body=body, timeout_s=timeout_s)
            except ThreadBusyError as exc:
                attempt += 1
                if attempt > max_busy_retries:
                    raise
                # ``_stream_once`` attaches the parsed header hint (or
                # ``None``); ``getattr`` keeps errors raised elsewhere working.
                hinted = getattr(exc, "retry_after_s", None)
                backoff = 0.5 * (2 ** attempt)
                wait = min(
                    self._MAX_BUSY_WAIT_S,
                    hinted if hinted is not None else backoff,
                )
                logger.info(
                    "thread_id=%s busy (%s); retry %d/%d after %.1fs",
                    thread_id,
                    exc.error_code,
                    attempt,
                    max_busy_retries,
                    wait,
                )
                await asyncio.sleep(wait)

    async def _stream_once(
        self,
        *,
        body: dict[str, Any],
        timeout_s: float,
    ) -> StreamedAnswer:
        """Issue one ``POST /new_chat`` and consume its SSE stream.

        Raises:
            ThreadBusyError: On HTTP 409; carries ``retry_after_s`` with
                the server's retry hint in seconds, or ``None``.
            httpx.HTTPStatusError: On any other non-success status.
        """
        # Per-call timeout — the connect should be quick, the read needs
        # to outlive the longest LLM completion.
        timeout = httpx.Timeout(timeout_s, connect=10.0)
        started = time.monotonic()
        async with self._http.stream(
            "POST",
            f"{self._base}/api/v1/new_chat",
            json=body,
            headers={"Accept": "text/event-stream"},
            timeout=timeout,
        ) as response:
            if response.status_code == 409:
                detail = await self._extract_busy_detail(response)
                busy = ThreadBusyError(
                    error_code=detail.get("errorCode", "THREAD_BUSY"),
                    message=detail.get("message", "Thread is busy"),
                )
                busy.retry_after_s = self._retry_after_seconds(response.headers)
                raise busy
            response.raise_for_status()
            answer = await self._consume_sse(response)
        answer.latency_ms = int((time.monotonic() - started) * 1000)
        return answer

    @staticmethod
    def _retry_after_seconds(headers: Any) -> float | None:
        """Extract the server's retry hint, in seconds, from ``headers``.

        ``retry-after-ms`` (milliseconds) wins over ``retry-after``
        (seconds). Values that are not finite, non-negative numbers — for
        instance the HTTP-date form of ``Retry-After`` — are ignored.

        Args:
            headers: Any mapping-like object with ``.get(name)``.

        Returns:
            The hint in seconds, or ``None`` when no usable hint exists.
        """
        for name, per_second in (("retry-after-ms", 1000.0), ("retry-after", 1.0)):
            raw = headers.get(name)
            if raw is None:
                continue
            try:
                seconds = float(raw) / per_second
            except (TypeError, ValueError):
                continue
            # Rejects negatives, ``inf`` and (via failed comparison) ``nan``.
            if 0.0 <= seconds < float("inf"):
                return seconds
        return None

    @staticmethod
    async def _extract_busy_detail(response: httpx.Response) -> dict[str, Any]:
        """Decode a 409 body into its detail mapping.

        Returns ``payload["detail"]`` when that is a dict, the payload
        itself when it is a dict, ``{}`` for other JSON values, and a
        synthetic ``THREAD_BUSY`` detail holding the raw text when the
        body is not valid JSON.
        """
        try:
            payload = json.loads(await response.aread())
        except (json.JSONDecodeError, ValueError):
            return {"errorCode": "THREAD_BUSY", "message": response.text}
        if isinstance(payload, dict) and isinstance(payload.get("detail"), dict):
            return payload["detail"]
        return payload if isinstance(payload, dict) else {}

    @staticmethod
    async def _consume_sse(response: httpx.Response) -> StreamedAnswer:
        """Walk SSE events, accumulate text-delta payloads.

        Backend events of interest:

        * ``{"type": "text-start", "id": ...}``
        * ``{"type": "text-delta", "id": ..., "delta": ...}``
        * ``{"type": "text-end", "id": ...}``
        * ``{"type": "start", "messageId": ...}`` (top-level message id)
        * ``{"type": "finish"}``
        * literal ``[DONE]`` sentinel

        Multiple ``text-start`` blocks can interleave — each gets its
        own ``id``, and the blocks are concatenated in the order their
        ids were first seen. Payloads that are not JSON objects are
        skipped and not recorded in ``raw_events``.
        """
        ordered_text_ids: list[str] = []
        text_buffers: dict[str, list[str]] = {}
        raw_events: list[dict[str, Any]] = []
        user_message_id: str | None = None
        assistant_message_id: str | None = None
        finished = False

        async for event in iter_sse_events(_aiter_lines(response)):
            data = event.data
            if data == "[DONE]":
                finished = True
                continue
            try:
                payload = json.loads(data)
            except (json.JSONDecodeError, ValueError):
                logger.debug("Skipping non-JSON SSE payload: %r", data[:120])
                continue
            if not isinstance(payload, dict):
                continue
            raw_events.append(payload)

            ev_type = payload.get("type")
            if ev_type == "text-delta":
                tid = str(payload.get("id", ""))
                delta = payload.get("delta", "")
                if not isinstance(delta, str):
                    continue
                # A delta may arrive without a preceding ``text-start``.
                if tid not in text_buffers:
                    text_buffers[tid] = []
                    ordered_text_ids.append(tid)
                text_buffers[tid].append(delta)
            elif ev_type == "text-start":
                tid = str(payload.get("id", ""))
                if tid and tid not in text_buffers:
                    text_buffers[tid] = []
                    ordered_text_ids.append(tid)
            elif ev_type == "start":
                msg_id = payload.get("messageId")
                if isinstance(msg_id, str):
                    # Only a fallback: never overrides an id already seen.
                    user_message_id = user_message_id or msg_id
            elif ev_type == "data-user-message-id":
                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
                if isinstance(msg_id, str):
                    user_message_id = msg_id
            elif ev_type == "data-assistant-message-id":
                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
                if isinstance(msg_id, str):
                    assistant_message_id = msg_id
            elif ev_type == "finish":
                finished = True

        text = "".join("".join(text_buffers.get(tid, [])) for tid in ordered_text_ids)
        return StreamedAnswer(
            text=text,
            raw_events=raw_events,
            user_message_id=user_message_id,
            assistant_message_id=assistant_message_id,
            finished_normally=finished,
        )
async def _aiter_lines(response: httpx.Response) -> AsyncIterator[str]:
"""Adapter so the parser can consume any line iterator (mockable in tests)."""
async for line in response.aiter_lines():
yield line

View file

@ -0,0 +1,207 @@
"""Client for ``/api/v1/searchspaces`` and ``/api/v1/search-spaces/{id}/llm-preferences``.
Verified against:
* ``surfsense_backend/app/routes/search_spaces_routes.py:116`` (POST create)
* ``surfsense_backend/app/routes/search_spaces_routes.py:234`` (GET by id)
* ``surfsense_backend/app/routes/search_spaces_routes.py:422`` (DELETE soft-delete)
* ``surfsense_backend/app/routes/search_spaces_routes.py:698-849`` (GET/PUT llm-preferences)
* ``surfsense_backend/app/schemas/search_space.py:14`` (SearchSpaceCreate body)
* ``surfsense_backend/app/routes/vision_llm_routes.py:60`` (GET global vision configs)
Note the inconsistent path naming in the backend: ``/searchspaces``
(no hyphen) for CRUD, but ``/search-spaces`` (hyphenated) for the
``llm-preferences`` sub-resource. Both are mirrored verbatim here.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import httpx
@dataclass
class SearchSpaceRow:
    """The handful of SearchSpace fields the harness reads."""

    id: int
    name: str
    description: str | None
    user_id: str
    citations_enabled: bool
    qna_custom_instructions: str | None

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> SearchSpaceRow:
        """Build a row from a decoded SearchSpace JSON object.

        ``id`` and ``name`` are required (``KeyError`` when absent). The
        other fields are optional: ``user_id`` falls back to ``""`` and
        ``citations_enabled`` to ``True``.
        """
        row_id = int(payload["id"])
        row_name = str(payload["name"])
        owner = str(payload.get("user_id", ""))
        citations = bool(payload.get("citations_enabled", True))
        return cls(
            id=row_id,
            name=row_name,
            description=payload.get("description"),
            user_id=owner,
            citations_enabled=citations,
            qna_custom_instructions=payload.get("qna_custom_instructions"),
        )
@dataclass
class VisionLlmConfigEntry:
    """Selected fields of one ``GET /global-vision-llm-configs`` row.

    According to the original author's note, the backend uses negative
    ids for global / OpenRouter-derived vision configs and positive ids
    for per-user BYOK rows, and
    ``set_llm_preferences(vision_llm_config_id=...)`` accepts either.
    """

    id: int
    name: str
    # Always stored upper-cased.
    provider: str
    model_name: str
    is_auto_mode: bool
    # The source payload object itself (not a copy).
    raw: dict[str, Any]

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> VisionLlmConfigEntry:
        """Build an entry; every field is optional and has an empty default."""
        provider_name = str(payload.get("provider", ""))
        return cls(
            id=int(payload.get("id", 0)),
            name=str(payload.get("name", "")),
            provider=provider_name.upper(),
            model_name=str(payload.get("model_name", "")),
            is_auto_mode=bool(payload.get("is_auto_mode", False)),
            raw=payload,
        )
@dataclass
class LlmPreferences:
    """LLM preference ids of a search space plus the embedded agent config.

    ``agent_llm`` holds the full config row as a plain mapping so callers
    can inspect fields such as ``provider`` / ``model_name``; ``raw`` is
    the source payload object itself.
    """

    agent_llm_id: int | None
    document_summary_llm_id: int | None
    image_generation_config_id: int | None
    vision_llm_config_id: int | None
    agent_llm: dict[str, Any] | None
    raw: dict[str, Any]

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
        """Copy the known keys out of ``payload``; absent keys become ``None``."""
        lookup = payload.get
        return cls(
            agent_llm_id=lookup("agent_llm_id"),
            document_summary_llm_id=lookup("document_summary_llm_id"),
            image_generation_config_id=lookup("image_generation_config_id"),
            vision_llm_config_id=lookup("vision_llm_config_id"),
            agent_llm=lookup("agent_llm"),
            raw=payload,
        )
class SearchSpaceClient:
    """Small client for the SearchSpace CRUD and LLM-preference routes."""

    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
        self._http = http
        self._base = base_url.rstrip("/")

    async def create(self, name: str, *, description: str | None = None) -> SearchSpaceRow:
        """``POST /api/v1/searchspaces``; ``description`` is sent only when given.

        ``citations_enabled`` is deliberately not sent so the backend's
        own default applies.
        """
        body: dict[str, Any] = (
            {"name": name}
            if description is None
            else {"name": name, "description": description}
        )
        response = await self._http.post(
            f"{self._base}/api/v1/searchspaces",
            json=body,
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        return SearchSpaceRow.from_payload(response.json())

    async def get(self, search_space_id: int) -> SearchSpaceRow:
        """Fetch one search space by id; raises on a non-success status."""
        url = f"{self._base}/api/v1/searchspaces/{search_space_id}"
        response = await self._http.get(url, headers={"Accept": "application/json"})
        response.raise_for_status()
        return SearchSpaceRow.from_payload(response.json())

    async def delete(self, search_space_id: int) -> None:
        """``DELETE /api/v1/searchspaces/{id}``, tolerating 404.

        A 404 means the space is already gone and counts as success, so
        teardown can be repeated safely. Any other non-success status
        raises.
        """
        url = f"{self._base}/api/v1/searchspaces/{search_space_id}"
        response = await self._http.delete(url, headers={"Accept": "application/json"})
        if response.status_code != 404:
            response.raise_for_status()

    async def get_llm_preferences(self, search_space_id: int) -> LlmPreferences:
        """Read the LLM preferences of a search space."""
        url = f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences"
        response = await self._http.get(url, headers={"Accept": "application/json"})
        response.raise_for_status()
        return LlmPreferences.from_payload(response.json())

    async def set_llm_preferences(
        self,
        search_space_id: int,
        *,
        agent_llm_id: int | None = None,
        document_summary_llm_id: int | None = None,
        image_generation_config_id: int | None = None,
        vision_llm_config_id: int | None = None,
    ) -> LlmPreferences:
        """PUT a partial update to ``/search-spaces/{id}/llm-preferences``.

        Only arguments that are not ``None`` are placed in the request
        body; the rest are omitted entirely (the original author notes
        the backend leaves omitted fields unchanged).
        """
        requested = {
            "agent_llm_id": agent_llm_id,
            "document_summary_llm_id": document_summary_llm_id,
            "image_generation_config_id": image_generation_config_id,
            "vision_llm_config_id": vision_llm_config_id,
        }
        body: dict[str, Any] = {
            key: value for key, value in requested.items() if value is not None
        }
        response = await self._http.put(
            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences",
            json=body,
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        return LlmPreferences.from_payload(response.json())

    async def list_global_vision_llm_configs(self) -> list[VisionLlmConfigEntry]:
        """List the registered global vision LLM configs.

        Entries flagged ``is_auto_mode`` are dropped from the result — an
        auto-selected model would make accuracy numbers irreproducible.

        Raises:
            RuntimeError: When the response body is not a JSON list.
            httpx.HTTPStatusError: On a non-success response status.
        """
        response = await self._http.get(
            f"{self._base}/api/v1/global-vision-llm-configs",
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, list):
            raise RuntimeError(
                f"Unexpected /global-vision-llm-configs payload: {payload!r}"
            )
        entries: list[VisionLlmConfigEntry] = []
        for item in payload:
            if bool(item.get("is_auto_mode", False)):
                continue
            entries.append(VisionLlmConfigEntry.from_payload(item))
        return entries

View file

@ -0,0 +1,279 @@
"""Environment + filesystem configuration for the harness.
Two responsibilities:
1. Load env vars (with sensible defaults) into a single immutable ``Config``
so that every other module reads it from one place.
2. Read / write ``data/state.json``. State is keyed by suite name so multiple
suites can be set up in parallel and torn down independently.
The pinned ``search_space_id`` lives in ``state.json`` (not env) so re-runs
are idempotent without forcing the operator to remember an integer.
"""
from __future__ import annotations
import json
import os
from collections.abc import Mapping
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
# Resolve once at import time. ``find_dotenv`` walks up; an explicit ``.env``
# at the package root or in CWD wins. Silent-no-op if neither exists.
load_dotenv()
_PROJECT_ROOT = Path(__file__).resolve().parents[3]
"""Resolves to ``surfsense_evals/`` (the package root, not ``src/``)."""
def _project_root() -> Path:
    """Hand back the cached ``surfsense_evals/`` project root.

    The value is the module-level ``_PROJECT_ROOT``, derived once at
    import time from this file's location
    (``src/surfsense_evals/core/config.py``, four levels up). It is
    exposed through a function so tests can monkeypatch it.
    """
    root = _PROJECT_ROOT
    return root
@dataclass(frozen=True)
class Config:
    """Frozen snapshot of the harness's runtime settings."""

    surfsense_api_base: str
    openrouter_api_key: str | None
    openrouter_base_url: str

    # Credentials: either a JWT, or an email + password pair.
    surfsense_jwt: str | None
    surfsense_refresh_token: str | None
    surfsense_user_email: str | None
    surfsense_user_password: str | None

    # Filesystem locations.
    data_dir: Path
    reports_dir: Path

    @property
    def state_path(self) -> Path:
        """Location of ``state.json`` inside ``data_dir``."""
        return self.data_dir / "state.json"

    def has_jwt_mode(self) -> bool:
        """Whether a non-empty JWT was supplied."""
        return bool(self.surfsense_jwt)

    def has_local_mode(self) -> bool:
        """Whether both e-mail and password are non-empty."""
        return bool(self.surfsense_user_email and self.surfsense_user_password)

    def credential_mode(self) -> str:
        """Return ``"jwt"``, ``"local"``, or ``"none"``; JWT wins when both exist."""
        if self.has_jwt_mode():
            return "jwt"
        return "local" if self.has_local_mode() else "none"

    def suite_data_dir(self, suite: str) -> Path:
        """``<data_dir>/<suite>``."""
        return self.data_dir / suite

    def suite_reports_dir(self, suite: str) -> Path:
        """``<reports_dir>/<suite>``."""
        return self.reports_dir / suite

    def suite_runs_dir(self, suite: str) -> Path:
        """``<data_dir>/<suite>/runs``."""
        suite_root = self.suite_data_dir(suite)
        return suite_root / "runs"

    def suite_maps_dir(self, suite: str) -> Path:
        """``<data_dir>/<suite>/maps``."""
        suite_root = self.suite_data_dir(suite)
        return suite_root / "maps"
def load_config() -> Config:
    """Snapshot the current process environment into a ``Config``.

    Nothing is validated here: each caller decides which fields it
    needs. Unset or empty variables fall back as follows:

    * ``EVAL_DATA_DIR`` / ``EVAL_REPORTS_DIR`` → ``data`` / ``reports``
      under the project root (both are resolved to absolute paths);
    * ``SURFSENSE_API_BASE`` → ``http://localhost:8000`` (default used
      only when unset);
    * ``OPENROUTER_BASE_URL`` → ``https://openrouter.ai/api/v1``
      (default used only when unset);
    * key / credential variables → ``None``.

    Trailing slashes are removed from both base URLs.
    """
    env = os.environ
    root = _project_root()

    def _optional(name: str) -> str | None:
        # Empty strings count as "not supplied".
        return env.get(name) or None

    data_dir = Path(env.get("EVAL_DATA_DIR") or (root / "data")).resolve()
    reports_dir = Path(env.get("EVAL_REPORTS_DIR") or (root / "reports")).resolve()
    api_base = env.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
    openrouter_url = env.get(
        "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
    ).rstrip("/")

    return Config(
        surfsense_api_base=api_base,
        openrouter_api_key=_optional("OPENROUTER_API_KEY"),
        openrouter_base_url=openrouter_url,
        surfsense_jwt=_optional("SURFSENSE_JWT"),
        surfsense_refresh_token=_optional("SURFSENSE_REFRESH_TOKEN"),
        surfsense_user_email=_optional("SURFSENSE_USER_EMAIL"),
        surfsense_user_password=_optional("SURFSENSE_USER_PASSWORD"),
        data_dir=data_dir,
        reports_dir=reports_dir,
    )
# ---------------------------------------------------------------------------
# state.json — per-suite slots
# ---------------------------------------------------------------------------
# Scenario names — chosen at ``setup`` time, persisted in ``state.json``.
#
# * ``head-to-head`` (default, current behaviour): both arms answer with the
# SAME slug pinned via ``--provider-model``. Vision LLM at ingest is
# optional but recommended for image-bearing benchmarks.
# * ``symmetric-cheap``: both arms answer with the SAME (cheap, text-only)
# slug; SurfSense pre-extracted images at ingest with a vision LLM.
# Measures whether vision-RAG ingestion lets a cheap downstream model
# match a vision one. Native arm structurally loses on image questions —
# that's the point, and the report labels it accordingly.
# * ``cost-arbitrage``: native arm answers with an EXPENSIVE vision slug
# (``--native-arm-model``), SurfSense answers with a CHEAP text-only slug
# (``--provider-model``) over chunks the vision LLM already extracted at
# ingest. Measures how close SurfSense gets to native at a fraction of
# the per-query cost. The most compelling "shines" framing.
SCENARIOS: tuple[str, ...] = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
DEFAULT_SCENARIO: str = "head-to-head"
@dataclass
class SuiteState:
    """State persisted for one suite in ``state.json``.

    ``provider_model`` is the slug pinned to the SearchSpace's
    ``agent_llm`` — the model that answers SurfSense queries, and the
    one the native arm uses as well unless ``native_arm_model`` is set.

    ``vision_provider_model`` is the slug of the vision LLM config
    attached through ``vision_llm_config_id``; ``None`` means no vision
    config was recorded at setup.
    """

    search_space_id: int
    agent_llm_id: int
    provider_model: str
    created_at: str
    ingestion_maps: dict[str, str] = field(default_factory=dict)
    scenario: str = DEFAULT_SCENARIO
    vision_llm_config_id: int | None = None
    vision_provider_model: str | None = None
    native_arm_model: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict; ``ingestion_maps`` is shallow-copied."""
        serialised: dict[str, Any] = {}
        serialised["search_space_id"] = self.search_space_id
        serialised["agent_llm_id"] = self.agent_llm_id
        serialised["provider_model"] = self.provider_model
        serialised["created_at"] = self.created_at
        serialised["ingestion_maps"] = dict(self.ingestion_maps)
        serialised["scenario"] = self.scenario
        serialised["vision_llm_config_id"] = self.vision_llm_config_id
        serialised["vision_provider_model"] = self.vision_provider_model
        serialised["native_arm_model"] = self.native_arm_model
        return serialised

    @classmethod
    def from_dict(cls, payload: Mapping[str, Any]) -> SuiteState:
        """Rebuild state from a ``state.json`` slot.

        ``search_space_id``, ``agent_llm_id`` and ``provider_model`` are
        required. Everything else is optional so files written before
        scenarios existed still load; a missing or unrecognised
        ``scenario`` is replaced by ``DEFAULT_SCENARIO``.
        """

        def _text_or_none(key: str) -> str | None:
            # Empty / missing values collapse to ``None``.
            value = payload.get(key)
            return str(value) if value else None

        chosen = str(payload.get("scenario") or DEFAULT_SCENARIO)
        scenario = chosen if chosen in SCENARIOS else DEFAULT_SCENARIO

        vision_id = payload.get("vision_llm_config_id")
        if vision_id is not None:
            vision_id = int(vision_id)

        return cls(
            search_space_id=int(payload["search_space_id"]),
            agent_llm_id=int(payload["agent_llm_id"]),
            provider_model=str(payload["provider_model"]),
            created_at=str(payload.get("created_at") or ""),
            ingestion_maps=dict(payload.get("ingestion_maps") or {}),
            scenario=scenario,
            vision_llm_config_id=vision_id,
            vision_provider_model=_text_or_none("vision_provider_model"),
            native_arm_model=_text_or_none("native_arm_model"),
        )

    @property
    def effective_native_arm_model(self) -> str:
        """Slug for the native arm: ``native_arm_model``, else ``provider_model``."""
        if self.native_arm_model:
            return self.native_arm_model
        return self.provider_model
def _load_state(config: Config) -> dict[str, Any]:
if not config.state_path.exists():
return {"suites": {}}
try:
with config.state_path.open("r", encoding="utf-8") as fh:
data = json.load(fh)
except (OSError, json.JSONDecodeError) as exc:
raise RuntimeError(
f"Failed to read state file {config.state_path}: {exc!s}. "
"Delete it if you want to start fresh."
) from exc
if not isinstance(data, dict) or "suites" not in data:
return {"suites": {}}
return data
def _write_state(config: Config, payload: Mapping[str, Any]) -> None:
config.data_dir.mkdir(parents=True, exist_ok=True)
tmp = config.state_path.with_suffix(".json.tmp")
with tmp.open("w", encoding="utf-8") as fh:
json.dump(dict(payload), fh, indent=2, sort_keys=True)
fh.write("\n")
tmp.replace(config.state_path)
def get_suite_state(config: Config, suite: str) -> SuiteState | None:
    """Look up the persisted state of ``suite``.

    Returns:
        The parsed ``SuiteState``, or ``None`` when the suite has no
        (non-empty) slot in ``state.json``.
    """
    slots = _load_state(config).get("suites") or {}
    slot = slots.get(suite)
    return SuiteState.from_dict(slot) if slot else None
def set_suite_state(config: Config, suite: str, suite_state: SuiteState) -> None:
    """Store ``suite_state`` in the slot for ``suite`` and rewrite the file.

    Slots of other suites are carried over unchanged.
    """
    document = _load_state(config)
    slots = dict(document.get("suites") or {})
    slots[suite] = suite_state.to_dict()
    document["suites"] = slots
    _write_state(config, document)
def clear_suite_state(config: Config, suite: str) -> bool:
    """Drop the slot for ``suite`` from the persisted document.

    Returns ``True`` when a slot was removed (and the file rewritten),
    ``False`` when there was nothing to remove (file left untouched).
    """
    document = _load_state(config)
    slots = dict(document.get("suites") or {})
    if suite in slots:
        slots.pop(suite)
        document["suites"] = slots
        _write_state(config, document)
        return True
    return False
def utc_iso_timestamp() -> str:
"""Filesystem-safe UTC ISO timestamp, e.g. ``2026-05-11T20-30-00Z``."""
return datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")

View file

@ -0,0 +1,311 @@
"""Per-upload ingestion settings shared across every benchmark.
The SurfSense ``POST /api/v1/documents/fileupload`` endpoint exposes
exactly three knobs (verified at
``surfsense_backend/app/routes/documents_routes.py`` and
``surfsense_backend/app/etl_pipeline/etl_document.py``):
* ``processing_mode`` ``"basic"`` (default) | ``"premium"``
* ``use_vision_llm`` ``bool`` (run vision LLM during ingest to
extract image content / captions / tables)
* ``should_summarize`` ``bool`` (generate document summary)
This module gives every benchmark a uniform way to:
1. Receive sensible per-benchmark defaults (text-only benchmarks
default vision off; image-bearing benchmarks default vision on).
2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
``--processing-mode {basic,premium}``,
``--should-summarize`` / ``--no-summarize``).
3. Persist the *actual* settings used into the doc-map manifest and
the run artifact so reports can show "vision=ON, mode=premium →
65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
A/B testing on the same corpus
------------------------------
SurfSense dedupes uploads by ``(filename, search_space_id)`` — NOT by
content hash and NOT by ingestion settings. Re-uploading the same
filename to the same SearchSpace with a different ``use_vision_llm``
flag will hit the duplicate branch and *not* re-process. To compare
two settings combos head-to-head on the same corpus you must give
each combo its own SearchSpace, which today means:
teardown --suite <s>
setup --suite <s> ...
ingest <s> <bench> --no-vision-llm # baseline run
run <s> <bench>
teardown --suite <s>
setup --suite <s> ...
ingest <s> <bench> --use-vision-llm # vision arm
run <s> <bench>
The runs land in different timestamped subdirectories under
``data/<suite>/runs/`` and ``report --suite <s>`` aggregates whichever
manifest is currently latest per benchmark.
"""
from __future__ import annotations
import argparse
import json
from collections.abc import Mapping
from dataclasses import dataclass
from pathlib import Path
from typing import Any
# Keep the constant list of valid processing modes here so benchmarks
# don't have to re-import from the backend (they don't have access to
# the backend package anyway).
PROCESSING_MODES: tuple[str, ...] = ("basic", "premium")
@dataclass(frozen=True)
class IngestSettings:
    """Immutable bundle of the three per-upload ingestion knobs.

    Instantiate directly to declare a benchmark's defaults, or call
    ``IngestSettings.merge(defaults, opts)`` to layer CLI overrides on
    top of those defaults.
    """

    use_vision_llm: bool = False
    processing_mode: str = "basic"
    should_summarize: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Return the three knobs as a plain dict keyed by field name."""
        field_names = ("use_vision_llm", "processing_mode", "should_summarize")
        return {field: getattr(self, field) for field in field_names}

    @classmethod
    def merge(cls, defaults: IngestSettings, opts: Mapping[str, Any]) -> IngestSettings:
        """Build settings from ``defaults`` overridden by ``opts``.

        Only the keys ``use_vision_llm``, ``processing_mode`` and
        ``should_summarize`` are read from ``opts``; a missing or ``None``
        value keeps the default. Any other keys are ignored, so callers
        can pass a wider options dict through unchanged.

        Raises:
            ValueError: ``processing_mode`` is given but not a valid mode.
        """
        vision = _coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm)
        mode = _coerce_mode(opts.get("processing_mode"), defaults.processing_mode)
        summarize = _coerce_bool(opts.get("should_summarize"), defaults.should_summarize)
        return cls(
            use_vision_llm=vision,
            processing_mode=mode,
            should_summarize=summarize,
        )

    def render_label(self) -> str:
        """One-line ``vision=…, mode=…, summarize=…`` label for logs and reports."""
        vision = "on" if self.use_vision_llm else "off"
        summarize = "on" if self.should_summarize else "off"
        return f"vision={vision}, mode={self.processing_mode}, summarize={summarize}"
def _coerce_bool(value: Any, default: bool) -> bool:
"""Argparse with ``BooleanOptionalAction`` yields True/False/None.
``None`` means the operator didn't pass the flag → fall back to
the benchmark default.
"""
if value is None:
return default
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.strip().lower() in {"1", "true", "yes", "on"}
return bool(value)
def _coerce_mode(value: Any, default: str) -> str:
if value is None or value == "":
return default
val = str(value).strip().lower()
if val not in PROCESSING_MODES:
raise ValueError(
f"Invalid processing_mode {val!r}; must be one of {PROCESSING_MODES}"
)
return val
# ---------------------------------------------------------------------------
# Argparse helper
# ---------------------------------------------------------------------------
def _add_bool_pair(
parser: argparse.ArgumentParser,
*,
dest: str,
on_flag: str,
off_flag: str,
on_help: str,
off_help: str,
) -> None:
"""Add a mutually exclusive ``--foo`` / ``--no-foo`` pair.
We don't use ``argparse.BooleanOptionalAction`` because it would
auto-generate ``--no-use-vision-llm`` rather than the friendlier
``--no-vision-llm`` that operators reach for. Default is ``None``
so ``IngestSettings.merge`` can distinguish "silent" from
"explicit false".
"""
group = parser.add_mutually_exclusive_group()
group.add_argument(
on_flag,
dest=dest,
action="store_true",
default=None,
help=on_help,
)
group.add_argument(
off_flag,
dest=dest,
action="store_false",
default=None,
help=off_help,
)
def add_ingest_settings_args(
    parser: argparse.ArgumentParser,
    *,
    defaults: IngestSettings,
) -> None:
    """Register the ingest-settings CLI flags on ``parser``.

    Adds an "ingest settings" argument group holding two boolean flag
    pairs (``--use-vision-llm`` / ``--no-vision-llm`` and
    ``--should-summarize`` / ``--no-summarize``) plus the
    ``--processing-mode`` choice. Every destination defaults to ``None``;
    ``defaults`` is used only to render the help text, so the benchmark
    default has to be applied afterwards (``IngestSettings.merge``).
    """
    vision_default = "on" if defaults.use_vision_llm else "off"
    summarize_default = "on" if defaults.should_summarize else "off"

    knobs = parser.add_argument_group(
        "ingest settings",
        "Per-upload knobs (forwarded to /documents/fileupload). "
        f"Defaults for this benchmark: {defaults.render_label()}.",
    )

    _add_bool_pair(
        knobs,
        dest="use_vision_llm",
        on_flag="--use-vision-llm",
        off_flag="--no-vision-llm",
        on_help=(
            "Run vision LLM during ingest to extract image content "
            f"(default for this benchmark: {vision_default})."
        ),
        off_help="Skip vision LLM during ingest (text-only ETL).",
    )

    knobs.add_argument(
        "--processing-mode",
        dest="processing_mode",
        choices=PROCESSING_MODES,
        default=None,
        help=(
            "SurfSense ETL processing mode (premium uses a 10x page "
            "multiplier and typically routes to a stronger ETL). "
            f"Default for this benchmark: {defaults.processing_mode!r}."
        ),
    )

    _add_bool_pair(
        knobs,
        dest="should_summarize",
        on_flag="--should-summarize",
        off_flag="--no-summarize",
        on_help=(
            "Have SurfSense generate a document summary at ingest "
            f"(default for this benchmark: {summarize_default})."
        ),
        off_help="Skip per-document summary generation.",
    )
# ---------------------------------------------------------------------------
# Doc-map manifest helpers
# ---------------------------------------------------------------------------
#
# Every benchmark writes a doc-map JSONL under ``data/<suite>/maps/`` that
# pairs source identifiers (case_id, snippet_id, doc_path, …) to the
# SurfSense document_ids returned by the upload. To make the report
# self-describing we also write a header line:
#
# {"__settings__": {"use_vision_llm": ..., "processing_mode": ..., ...}}
#
# These helpers centralise that protocol so each benchmark only has to
# call ``settings_header_line`` when writing and ``read_settings_header``
# when reading.
# Key that marks a doc-map JSONL row as the settings header.
SETTINGS_HEADER_KEY = "__settings__"


def settings_header_line(settings: IngestSettings) -> str:
    """Serialise ``settings`` as the doc-map header row.

    The result is a one-key JSON object,
    ``{"__settings__": settings.to_dict()}``, without a trailing newline.
    """
    header_row = {SETTINGS_HEADER_KEY: settings.to_dict()}
    return json.dumps(header_row)
def is_settings_header(row: Mapping[str, Any]) -> bool:
    """Tell whether a parsed doc-map row is the settings header row."""
    has_marker = SETTINGS_HEADER_KEY in row
    return has_marker
def read_settings_header(map_path: Path) -> dict[str, Any]:
    """Read the ``__settings__`` header out of a doc-map JSONL.

    Only the first non-blank line is inspected. Returns a copy of the
    header mapping, or ``{}`` when:

    * the file is missing, unreadable, or not valid UTF-8;
    * the file is empty or contains only blank lines;
    * the first non-blank line is not valid JSON, is not an object, or
      has no ``__settings__`` key (e.g. a corpus ingested before this
      feature existed);
    * the ``__settings__`` value is not itself a JSON object.

    Callers use this purely to surface settings in the report; it must
    never fail the run.
    """
    try:
        with map_path.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                row = json.loads(line)
                header = row.get(SETTINGS_HEADER_KEY) if isinstance(row, dict) else None
                # A malformed header value (number, string, list, null) is
                # treated like a missing header rather than crashing dict().
                return dict(header) if isinstance(header, dict) else {}
    except (OSError, ValueError):
        # OSError: missing / unreadable file. ValueError: malformed JSON
        # (json.JSONDecodeError) or invalid UTF-8 (UnicodeDecodeError).
        return {}
    return {}
def format_ingest_settings_md(settings: Any) -> str:
    """Format recorded ingest settings as one Markdown bullet.

    A non-mapping or empty ``settings`` yields a "not recorded" bullet.
    Otherwise missing or falsy flags render as ``off`` and a missing or
    empty ``processing_mode`` renders as ``basic``.
    """
    recorded = isinstance(settings, Mapping) and bool(settings)
    if not recorded:
        return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"

    def _switch(key: str) -> str:
        return "on" if settings.get(key) else "off"

    vision = _switch("use_vision_llm")
    mode = settings.get("processing_mode") or "basic"
    summarize = _switch("should_summarize")
    return (
        f"- SurfSense ingest settings: vision_llm=`{vision}`, "
        f"processing_mode=`{mode}`, summarize=`{summarize}`"
    )
__all__ = [
"PROCESSING_MODES",
"SETTINGS_HEADER_KEY",
"IngestSettings",
"add_ingest_settings_args",
"format_ingest_settings_md",
"is_settings_header",
"read_settings_header",
"settings_header_line",
]

View file

@ -0,0 +1,50 @@
"""Pure-function metric primitives. Lazy imports."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING: # pragma: no cover
from .comparison import McnemarResult, bootstrap_delta_ci, mcnemar_test, paired_aggregate
from .mc_accuracy import AccuracyResult, accuracy_with_wilson_ci, wilson_ci
from .retrieval import RetrievalScores, mrr, ndcg_at_k, recall_at_k, score_run
__all__ = [
"AccuracyResult",
"McnemarResult",
"RetrievalScores",
"accuracy_with_wilson_ci",
"bootstrap_delta_ci",
"mcnemar_test",
"mrr",
"ndcg_at_k",
"paired_aggregate",
"recall_at_k",
"score_run",
"wilson_ci",
]
# Maps each lazily exported name to the sibling submodule that defines
# it. The module-level ``__getattr__`` below consults this table to
# import the submodule on first attribute access.
_MODULE_FOR = {
    "AccuracyResult": "mc_accuracy",
    "accuracy_with_wilson_ci": "mc_accuracy",
    "wilson_ci": "mc_accuracy",
    "RetrievalScores": "retrieval",
    "mrr": "retrieval",
    "ndcg_at_k": "retrieval",
    "recall_at_k": "retrieval",
    "score_run": "retrieval",
    "McnemarResult": "comparison",
    "bootstrap_delta_ci": "comparison",
    "mcnemar_test": "comparison",
    "paired_aggregate": "comparison",
}
def __getattr__(name: str):
    """Module-level attribute hook that imports metric submodules on demand.

    Names listed in ``_MODULE_FOR`` are resolved by importing the mapped
    sibling submodule and returning its attribute of the same name.

    Raises:
        AttributeError: ``name`` is not one of the lazily exported names.
    """
    if name not in _MODULE_FOR:
        raise AttributeError(f"module 'surfsense_evals.core.metrics' has no attribute {name!r}")
    from importlib import import_module

    submodule = import_module(f".{_MODULE_FOR[name]}", __name__)
    return getattr(submodule, name)

View file

@ -0,0 +1,258 @@
"""Paired comparison statistics for head-to-head benchmarks.
In every head-to-head benchmark (currently MedXpertQA-MM and
MMLongBench-Doc) each question is answered by both arms (Native PDF
and SurfSense). That makes per-question outcomes paired, so
``McNemar's test`` on the discordant pairs is the right significance
test for "are the two arms different?". We also expose a bootstrap
delta CI for visualising effect size.
Aggregate cost / latency / token deltas are mean-based; the runner
slices them by arm before passing them in.
"""
from __future__ import annotations
import math
import statistics
from collections.abc import Sequence
from dataclasses import dataclass
import numpy as np
@dataclass(frozen=True)
class McnemarResult:
    """Outcome of a McNemar test: discordant-pair counts and test statistics."""

    n_total: int
    b: int  # pairs where the native arm is correct and surfsense is wrong
    c: int  # pairs where the native arm is wrong and surfsense is correct
    statistic: float
    p_value: float
    method: str

    def to_dict(self) -> dict[str, float | int | str]:
        """Return a plain dict; ``b`` / ``c`` are exported under descriptive keys."""
        exported: dict[str, float | int | str] = {}
        exported["n_total"] = self.n_total
        exported["b_native_correct_only"] = self.b
        exported["c_surfsense_correct_only"] = self.c
        exported["statistic"] = self.statistic
        exported["p_value"] = self.p_value
        exported["method"] = self.method
        return exported
def mcnemar_test(
    arm_a_correct: Sequence[bool],
    arm_b_correct: Sequence[bool],
    *,
    use_exact_below: int = 11,
) -> McnemarResult:
    """Paired McNemar test on per-question correctness flags.

    ``arm_a_correct`` is the reference arm and ``arm_b_correct`` the
    challenger. Only discordant pairs enter the statistic: ``b`` counts
    pairs where only arm A is correct, ``c`` pairs where only arm B is.

    Method selection, recorded in ``McnemarResult.method``:

    * ``"degenerate"`` — no discordant pairs; statistic 0.0, p-value 1.0.
    * ``"exact"`` — ``b + c < use_exact_below``; two-sided exact binomial
      test with p = 0.5, p-value capped at 1.0, statistic ``min(b, c)``.
    * ``"chi2_cc"`` — otherwise; continuity-corrected chi-square with one
      degree of freedom.

    Raises:
        ValueError: the two sequences differ in length.
    """
    if len(arm_a_correct) != len(arm_b_correct):
        raise ValueError(
            f"Length mismatch: arm_a={len(arm_a_correct)}, arm_b={len(arm_b_correct)}"
        )
    n = len(arm_a_correct)

    # One pass over the pairs; concordant pairs touch neither counter.
    b = 0
    c = 0
    for ref_ok, challenger_ok in zip(arm_a_correct, arm_b_correct):
        if ref_ok:
            if not challenger_ok:
                b += 1
        elif challenger_ok:
            c += 1

    discordant = b + c
    if discordant == 0:
        return McnemarResult(
            n_total=n, b=b, c=c, statistic=0.0, p_value=1.0, method="degenerate"
        )

    if discordant >= use_exact_below:
        # Chi-square with continuity correction (McNemar-Edwards).
        chi = ((abs(b - c) - 1) ** 2) / discordant
        return McnemarResult(
            n_total=n,
            b=b,
            c=c,
            statistic=chi,
            p_value=_chi2_sf(chi, df=1),
            method="chi2_cc",
        )

    # Exact binomial: under H0 each discordant pair is a fair coin flip, so
    # the two-sided p-value is twice the lower tail up to min(b, c).
    smaller = min(b, c)
    lower_tail = sum(_binom_pmf(discordant, i) for i in range(smaller + 1))
    return McnemarResult(
        n_total=n,
        b=b,
        c=c,
        statistic=float(smaller),
        p_value=min(1.0, 2.0 * lower_tail),
        method="exact",
    )
def _binom_pmf(n: int, k: int) -> float:
return math.comb(n, k) * (0.5 ** n)
def _chi2_sf(x: float, *, df: int) -> float:
"""Survival function (1 - CDF) of chi-square; df=1 closed form."""
if x <= 0:
return 1.0
if df == 1:
# Chi^2(1) = N(0,1)^2; sf(x) = 2 * Phi_complement(sqrt(x))
return math.erfc(math.sqrt(x / 2.0))
# General fallback via regularized upper incomplete gamma.
a = df / 2.0
z = x / 2.0
return _gammaincc(a, z)
def _gammaincc(a: float, x: float, *, max_iter: int = 200, tol: float = 1e-12) -> float:
"""Regularised upper incomplete gamma Q(a, x). Series + continued fraction."""
if x < 0 or a <= 0:
return float("nan")
if x == 0:
return 1.0
if x < a + 1.0:
# Series for P(a, x); subtract from 1.
p_series = _gammainc_series(a, x, max_iter=max_iter, tol=tol)
return 1.0 - p_series
return _gammaincc_cf(a, x, max_iter=max_iter, tol=tol)
def _gammainc_series(a: float, x: float, *, max_iter: int, tol: float) -> float:
term = 1.0 / a
summation = term
for n in range(1, max_iter):
term *= x / (a + n)
summation += term
if abs(term) < abs(summation) * tol:
break
log_pre = -x + a * math.log(x) - math.lgamma(a)
return summation * math.exp(log_pre)
def _gammaincc_cf(a: float, x: float, *, max_iter: int, tol: float) -> float:
b = x + 1.0 - a
c_val = 1.0 / 1e-300
d = 1.0 / b
h = d
for i in range(1, max_iter):
an = -i * (i - a)
b += 2.0
d = an * d + b
if abs(d) < 1e-300:
d = 1e-300
c_val = b + an / c_val
if abs(c_val) < 1e-300:
c_val = 1e-300
d = 1.0 / d
delta = d * c_val
h *= delta
if abs(delta - 1.0) < tol:
break
log_pre = -x + a * math.log(x) - math.lgamma(a)
return h * math.exp(log_pre)
# ---------------------------------------------------------------------------
# Bootstrap delta CI
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class BootstrapDelta:
    """Point estimate of an accuracy delta with its bootstrap interval."""

    delta: float
    ci_low: float
    ci_high: float
    n_resamples: int

    def to_dict(self) -> dict[str, float | int]:
        """Return the four fields as a plain dict keyed by field name."""
        keys = ("delta", "ci_low", "ci_high", "n_resamples")
        return {key: getattr(self, key) for key in keys}
def bootstrap_delta_ci(
    arm_a_correct: Sequence[bool],
    arm_b_correct: Sequence[bool],
    *,
    n_resamples: int = 5000,
    level: float = 0.95,
    random_state: int | None = 0,
) -> BootstrapDelta:
    """Percentile bootstrap interval for ``mean(arm_b) - mean(arm_a)``.

    Each resample draws ``n`` pair indices with replacement and applies
    the same indices to both arms, so the pairing between arms is kept.
    The interval bounds are the ``(1 - level) / 2`` and
    ``1 - (1 - level) / 2`` quantiles of the resampled deltas. Empty
    input yields an all-zero result with ``n_resamples == 0``.

    Raises:
        ValueError: the two sequences differ in length.
    """
    if len(arm_a_correct) != len(arm_b_correct):
        raise ValueError("paired arms must have the same length")
    n = len(arm_a_correct)
    if n == 0:
        return BootstrapDelta(0.0, 0.0, 0.0, 0)

    reference = np.asarray(arm_a_correct, dtype=np.int8)
    challenger = np.asarray(arm_b_correct, dtype=np.int8)
    observed = float(challenger.mean() - reference.mean())

    rng = np.random.default_rng(random_state)
    resampled = np.empty(n_resamples, dtype=np.float64)
    for draw in range(n_resamples):
        picks = rng.integers(0, n, size=n)
        resampled[draw] = challenger[picks].mean() - reference[picks].mean()

    tail = (1.0 - level) / 2.0
    lower = float(np.quantile(resampled, tail))
    upper = float(np.quantile(resampled, 1 - tail))
    return BootstrapDelta(
        delta=observed, ci_low=lower, ci_high=upper, n_resamples=n_resamples
    )
# ---------------------------------------------------------------------------
# Simple aggregate helpers (cost / latency / tokens)
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class Aggregate:
    """Summary statistics (mean, median, 95th percentile, count) of a sample."""

    mean: float
    median: float
    p95: float
    n: int

    def to_dict(self) -> dict[str, float | int]:
        """Return the four fields as a plain dict keyed by field name."""
        keys = ("mean", "median", "p95", "n")
        return {key: getattr(self, key) for key in keys}
def paired_aggregate(values: Sequence[float]) -> Aggregate:
    """Summarise ``values`` (e.g. cost per question) as mean / median / p95.

    An empty sequence yields an all-zero ``Aggregate`` with ``n == 0``.
    """
    if not values:
        return Aggregate(0.0, 0.0, 0.0, 0)
    samples = np.asarray(values, dtype=np.float64)
    summary = {
        "mean": float(samples.mean()),
        "median": float(statistics.median(values)),
        "p95": float(np.quantile(samples, 0.95)),
    }
    return Aggregate(n=len(values), **summary)
__all__ = [
"Aggregate",
"BootstrapDelta",
"McnemarResult",
"bootstrap_delta_ci",
"mcnemar_test",
"paired_aggregate",
]

View file

@ -0,0 +1,130 @@
"""Multiple-choice accuracy + Wilson 95% confidence intervals.
Wilson CI is preferred over normal-approximation because MIRAGE's
per-task subsets can be small (PubMedQA* and BioASQ-Y/N have a few
hundred questions each) and Wilson handles n→0 / p∈{0,1} edges
gracefully.
Reference for the closed form: Wilson (1927); identical to the
``statsmodels.stats.proportion.proportion_confint(method='wilson')``
output and what scikit-learn implements internally for its bounded
estimators.
"""
from __future__ import annotations
import math
from collections.abc import Mapping, Sequence
from dataclasses import dataclass
@dataclass(frozen=True)
class AccuracyResult:
    """Accuracy of one task together with its Wilson interval bounds."""

    n_correct: int
    n_total: int
    accuracy: float
    ci_low: float
    ci_high: float

    def to_dict(self) -> dict[str, float | int]:
        """Return the five fields as a plain dict keyed by field name."""
        keys = ("n_correct", "n_total", "accuracy", "ci_low", "ci_high")
        return {key: getattr(self, key) for key in keys}
# Two-sided Wilson z values. 1.959964 ≈ z_{0.975}.
_Z_FOR_LEVEL: dict[float, float] = {
0.90: 1.6448536269514722,
0.95: 1.959963984540054,
0.99: 2.5758293035489004,
}
def wilson_ci(
    n_correct: int, n_total: int, *, level: float = 0.95
) -> tuple[float, float]:
    """Two-sided Wilson score interval for the proportion ``n_correct / n_total``.

    Returns ``(low, high)`` clamped to ``[0, 1]``. ``n_total <= 0``
    returns ``(0.0, 1.0)`` — the maximally uncertain interval — before
    ``level`` is looked at.

    Raises:
        ValueError: ``level`` has no entry in ``_Z_FOR_LEVEL``.
    """
    if n_total <= 0:
        return 0.0, 1.0
    if level not in _Z_FOR_LEVEL:
        raise ValueError(f"Unsupported confidence level {level!r}")
    z = _Z_FOR_LEVEL[level]
    n = n_total
    p_hat = n_correct / n_total
    shrink = 1.0 + (z * z) / n
    midpoint = (p_hat + (z * z) / (2 * n)) / shrink
    spread = (z / shrink) * math.sqrt((p_hat * (1 - p_hat) / n) + (z * z) / (4 * n * n))
    return max(0.0, midpoint - spread), min(1.0, midpoint + spread)
def accuracy_with_wilson_ci(
    n_correct: int, n_total: int, *, level: float = 0.95
) -> AccuracyResult:
    """Bundle the accuracy ``n_correct / n_total`` with its Wilson interval.

    ``n_total == 0`` is allowed and reports an accuracy of 0.0.

    Raises:
        ValueError: ``n_total`` is negative, or ``n_correct`` lies
            outside ``[0, n_total]``.
    """
    if n_total < 0:
        raise ValueError(f"n_total must be >= 0, got {n_total}")
    if not (0 <= n_correct <= n_total):
        raise ValueError(
            f"n_correct must be in [0, n_total]; got n_correct={n_correct}, n_total={n_total}"
        )
    if n_total > 0:
        accuracy = n_correct / n_total
    else:
        accuracy = 0.0
    ci_low, ci_high = wilson_ci(n_correct, n_total, level=level)
    return AccuracyResult(
        n_correct=n_correct,
        n_total=n_total,
        accuracy=accuracy,
        ci_low=ci_low,
        ci_high=ci_high,
    )
def per_task_accuracy(
    rows: Sequence[Mapping[str, object]],
    *,
    task_key: str = "task",
    correct_key: str = "is_correct",
    level: float = 0.95,
) -> dict[str, AccuracyResult]:
    """Compute one ``AccuracyResult`` per distinct task label in ``rows``.

    The label is ``str(row.get(task_key, ""))``, so rows without the key
    are grouped under the empty string. A row counts as correct when
    ``row.get(correct_key)`` is truthy.
    """
    tallies: dict[str, list[int]] = {}
    for row in rows:
        label = str(row.get(task_key, ""))
        tally = tallies.setdefault(label, [0, 0])  # [n_correct, n_total]
        tally[1] += 1
        if row.get(correct_key):
            tally[0] += 1

    results: dict[str, AccuracyResult] = {}
    for label, (n_correct, n_total) in tallies.items():
        results[label] = accuracy_with_wilson_ci(n_correct, n_total, level=level)
    return results
def macro_accuracy(per_task: Mapping[str, AccuracyResult]) -> float:
    """Unweighted mean of the per-task accuracies; 0.0 when there are no tasks."""
    if not per_task:
        return 0.0
    accuracies = [result.accuracy for result in per_task.values()]
    return sum(accuracies) / len(accuracies)
__all__ = [
"AccuracyResult",
"accuracy_with_wilson_ci",
"macro_accuracy",
"per_task_accuracy",
"wilson_ci",
]

View file

@ -0,0 +1,132 @@
"""Retrieval metrics: Recall@k, MRR, nDCG@k.
Used by CUREv1's runner to score the SurfSense arm against the
benchmark's qrels. ``corpus_id`` is the canonical CUREv1 passage id
(string); the runner maps SurfSense ``chunk_id`` → ``document_id`` →
``corpus_id`` before calling these.
Graded relevance (CUREv1 uses 0/1/2 grades) is honoured by ``ndcg_at_k``;
``recall_at_k`` and ``mrr`` flatten anything > 0 to "relevant".
"""
from __future__ import annotations
import math
from collections.abc import Iterable, Mapping, Sequence
from dataclasses import dataclass
@dataclass(frozen=True)
class RetrievalScores:
    """Run-level retrieval metrics averaged over ``n_queries`` queries."""

    recall_at_k: dict[int, float]
    mrr: float
    ndcg_at_10: float
    n_queries: int

    def to_dict(self) -> dict:
        """Return a plain dict; ``recall_at_k`` is shallow-copied."""
        recall_copy = dict(self.recall_at_k)
        return {
            "recall_at_k": recall_copy,
            "mrr": self.mrr,
            "ndcg_at_10": self.ndcg_at_10,
            "n_queries": self.n_queries,
        }
def recall_at_k(retrieved: Sequence[str], relevant: Iterable[str], k: int) -> float:
    """Fraction of distinct ``relevant`` documents found in ``retrieved[:k]``.

    Each relevant document is counted at most once, so the result always
    lies in ``[0, 1]`` even when ``retrieved`` repeats an id (for example
    several chunks that map to the same document).

    Returns 0.0 when ``relevant`` is ``None`` or empty, or when ``k`` is
    not positive.
    """
    relevant_set = set(relevant) if relevant is not None else set()
    # A negative k would slice from the end of the list; there is no such
    # thing as a "top -1", so treat any non-positive cutoff as no hits.
    if not relevant_set or k <= 0:
        return 0.0
    found = relevant_set.intersection(list(retrieved)[:k])
    return len(found) / len(relevant_set)
def mrr(retrieved: Sequence[str], relevant: Iterable[str]) -> float:
    """Reciprocal of the 1-based rank of the first relevant document.

    Returns 0.0 when no document in ``retrieved`` is relevant.
    """
    relevant_set = set(relevant)
    reciprocal_ranks = (
        1.0 / rank
        for rank, doc in enumerate(retrieved, start=1)
        if doc in relevant_set
    )
    return next(reciprocal_ranks, 0.0)
def _dcg_at_k(grades: Sequence[float], k: int) -> float:
s = 0.0
for i, grade in enumerate(grades[:k], start=1):
# Standard log-base-2 discount; gain = 2^grade - 1 for graded relevance.
s += (2.0 ** grade - 1.0) / math.log2(i + 1)
return s
def ndcg_at_k(
retrieved: Sequence[str],
qrels: Mapping[str, float],
k: int,
) -> float:
"""nDCG@k against graded ``qrels`` (``{doc_id: grade}``).
Unjudged documents in ``retrieved`` contribute zero gain. The
ideal ordering is ``qrels`` sorted by grade descending.
"""
if not qrels:
return 0.0
grades = [float(qrels.get(doc, 0.0)) for doc in retrieved]
dcg = _dcg_at_k(grades, k)
ideal = sorted(qrels.values(), reverse=True)
idcg = _dcg_at_k([float(g) for g in ideal], k)
if idcg == 0.0:
return 0.0
return dcg / idcg
def score_run(
    *,
    per_query_retrieved: Mapping[str, Sequence[str]],
    per_query_qrels: Mapping[str, Mapping[str, float]],
    ks: Sequence[int] = (1, 5, 10, 32),
    ndcg_k: int = 10,
) -> RetrievalScores:
    """Aggregate Recall@k, MRR, nDCG@k across a run.

    ``per_query_retrieved`` maps ``query_id -> ordered list of doc ids``.
    ``per_query_qrels`` maps ``query_id -> {doc_id: grade}`` (grade > 0
    is relevant).

    Only queries present in *both* mappings are scored; every metric is
    the mean over those queries and ``n_queries`` is their count. A query
    that has qrels but no entry in ``per_query_retrieved`` is therefore
    skipped, not scored as zero — pass an empty list for it if a miss
    should count against the run.
    NOTE(review): an earlier docstring promised that such queries
    "contribute zeros"; confirm which behaviour the runner relies on.

    The ``ndcg_at_10`` field of the result carries nDCG at ``ndcg_k``,
    whatever value was passed.
    """
    # Walk the qrels mapping in its own order rather than a set: set
    # iteration order varies between interpreter runs for str keys, which
    # changes float summation order and hence the low-order bits.
    qids = [qid for qid in per_query_qrels if qid in per_query_retrieved]
    if not qids:
        return RetrievalScores(
            recall_at_k={k: 0.0 for k in ks}, mrr=0.0, ndcg_at_10=0.0, n_queries=0
        )
    recall_totals = {k: 0.0 for k in ks}
    mrr_total = 0.0
    ndcg_total = 0.0
    for qid in qids:
        retrieved = list(per_query_retrieved[qid])
        qrels = per_query_qrels[qid]
        # Recall and MRR are binary: any positive grade counts as relevant.
        relevant_docs = [d for d, g in qrels.items() if g > 0]
        for k in ks:
            recall_totals[k] += recall_at_k(retrieved, relevant_docs, k)
        mrr_total += mrr(retrieved, relevant_docs)
        ndcg_total += ndcg_at_k(retrieved, qrels, ndcg_k)
    n = len(qids)
    return RetrievalScores(
        recall_at_k={k: v / n for k, v in recall_totals.items()},
        mrr=mrr_total / n,
        ndcg_at_10=ndcg_total / n,
        n_queries=n,
    )
__all__ = ["RetrievalScores", "mrr", "ndcg_at_k", "recall_at_k", "score_run"]

View file

@ -0,0 +1,21 @@
"""Parsers shared across suites: citations, MCQ envelopes, AI-SDK SSE."""
from __future__ import annotations
from .answer_letter import AnswerLetterResult, extract_answer_letter
from .citations import CITATION_REGEX, CitationToken, ChunkCitation, UrlCitation, parse_citations
from .freeform_answer import extract_freeform_answer
from .sse import SseEvent, iter_sse_events
__all__ = [
"CITATION_REGEX",
"CitationToken",
"ChunkCitation",
"UrlCitation",
"parse_citations",
"AnswerLetterResult",
"extract_answer_letter",
"extract_freeform_answer",
"SseEvent",
"iter_sse_events",
]

View file

@ -0,0 +1,122 @@
"""Robust extractor for MCQ answer letters.
Handles three answer shapes seen in the wild:
1. **MedRAG envelope** ``{"step_by_step_thinking": "...", "answer_choice": "A"}``
embedded somewhere in the assistant message (often inside ```` ```json ```` /
``` ``` ``` fences). The regex grabs the JSON object and reads the
``answer_choice`` field.
2. **Final-line letter** e.g. ``Answer: B`` or ``The correct answer is (C).``.
Falls back to a permissive regex over the last few lines.
3. **Bare letter** single uppercase letter at the end of the message.
The function returns the parsed letter (uppercased) plus a discriminator
of which strategy fired so the runner / report can flag suspicious
parses (typically zero-confidence parses indicate the model didn't
follow the prompt).
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Literal
# Label recorded on ``AnswerLetterResult.strategy``: which extraction
# strategy produced the letter, or ``"none"`` when nothing was found.
ParserStrategy = Literal["json_envelope", "answer_line", "bare_letter", "none"]
@dataclass(frozen=True)
class AnswerLetterResult:
    """Extracted answer letter plus the strategy label that produced it."""

    letter: str | None
    strategy: ParserStrategy

    @property
    def found(self) -> bool:
        """``True`` when a letter was extracted (``letter`` is not ``None``)."""
        return not (self.letter is None)
# ---------------------------------------------------------------------------
# Strategies
# ---------------------------------------------------------------------------
_JSON_BLOCK = re.compile(r"\{[^{}]*\"answer_choice\"\s*:\s*\"([A-Za-z])\"[^{}]*\}", re.DOTALL)
_FENCED_JSON = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL | re.IGNORECASE)
_ANSWER_LINE = re.compile(
r"(?:final\s*answer|answer\s*choice|the\s+correct\s+answer\s+is|answer)\s*[:=\-]?\s*"
r"\(?\s*([A-Za-z])\s*[\)\.]*\s*$",
re.IGNORECASE | re.MULTILINE,
)
_BARE_LETTER = re.compile(r"^\s*\(?\s*([A-Za-z])\s*[\)\.]*\s*$", re.MULTILINE)
# Leading standalone letter in an ``answer_choice`` value: an optional
# opening bracket, then one ASCII letter that is not the start of a word.
_CHOICE_TOKEN = re.compile(r"[\(\[]?\s*([A-Za-z])(?![A-Za-z])")


def _from_json_envelope(text: str) -> str | None:
    """Read the answer letter from an ``answer_choice`` JSON field.

    Fenced code blocks are tried first: each is parsed as JSON and, if
    it is an object, its ``answer_choice`` string must start with a
    standalone letter (``"B"``, ``"(B)"``, ``"b."``, ``"B) aspirin"``).
    Values that start with a word (``"Option B"``) are skipped instead
    of yielding their first character. If no fence yields a letter, a
    tolerant regex over the whole text handles unfenced envelopes.

    Returns the upper-cased letter, or ``None`` when nothing matches.
    """
    # Try fenced code blocks first (most likely to contain the JSON).
    for fence in _FENCED_JSON.finditer(text):
        try:
            obj = json.loads(fence.group(1))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(obj, dict):
            continue
        choice = obj.get("answer_choice")
        if isinstance(choice, str):
            token = _CHOICE_TOKEN.match(choice.strip())
            if token:
                return token.group(1).upper()
    # Fall back to a tolerant regex over the whole text (handles
    # responses that drop the fences).
    match = _JSON_BLOCK.search(text)
    if match:
        return match.group(1).upper()
    return None
def _from_answer_line(text: str) -> str | None:
    """Return the letter of the last ``_ANSWER_LINE`` match in ``text``.

    Candidates are examined from the end of the text backwards, since a
    stated answer usually sits near the end. Returns the upper-cased
    letter, or ``None`` when there is no match.
    """
    candidates = [hit.group(1).upper() for hit in _ANSWER_LINE.finditer(text)]
    while candidates:
        letter = candidates.pop()  # take from the end: last match first
        if letter.isalpha():
            return letter
    return None
def _from_bare_letter(text: str) -> str | None:
# Inspect only the final non-empty lines (avoid grabbing in-prose
# mentions of "A" or "I").
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
for ln in reversed(lines[-3:]):
match = _BARE_LETTER.match(ln)
if match:
return match.group(1).upper()
return None
def extract_answer_letter(text: str) -> AnswerLetterResult:
    """Try each extraction strategy in turn and return the first hit.

    Order: JSON envelope → final-answer-line regex → bare-letter
    fallback. Empty or whitespace-only text, and text where no strategy
    matches, return ``AnswerLetterResult(None, "none")``.
    """
    if not text or not text.strip():
        return AnswerLetterResult(None, "none")
    strategies = (
        ("json_envelope", _from_json_envelope),
        ("answer_line", _from_answer_line),
        ("bare_letter", _from_bare_letter),
    )
    for label, strategy in strategies:
        letter = strategy(text)
        if letter:
            return AnswerLetterResult(letter, label)
    return AnswerLetterResult(None, "none")
__all__ = ["AnswerLetterResult", "ParserStrategy", "extract_answer_letter"]

View file

@ -0,0 +1,110 @@
"""Python port of the canonical citation parser.
Source of truth: ``surfsense_web/lib/citations/citation-parser.ts:20-21``.
The pattern is byte-for-byte identical to the TS export ``CITATION_REGEX``
so a SurfSense user reading the web client and a CUREv1 retrieval scorer
running here see the same chunk_ids extracted from the same answer.
The TS reference also handles a ``urlcite{N}`` placeholder produced by
``preprocessCitationMarkdown`` that pre-processing step is web-only
(GFM autolink workaround), so the harness sees raw ``[citation:URL]``
tokens and ``parse_citations`` returns them as ``UrlCitation`` directly.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Any, Union
# Pattern preserves the TS source verbatim:
# /[\[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g
#
# Notes:
# * Matches both ASCII ``[]`` and Chinese fullwidth ``【】`` brackets.
# * Allows an optional ZWSP (``\u200B``) just inside each bracket.
# * ``citation:`` then EITHER a URL (anything not ``]``, ``】``, or ZWSP),
# OR a ``urlcite\d+`` placeholder, OR one or more comma-separated
# chunk ids (each optionally prefixed with ``doc-`` and optionally
# negative).
# * URL char class deliberately excludes the closing brackets so a
# ``[citation:https://x.com]`` doesn't swallow the ``]``.
# The ZWSP must be the actual code-point — the original TS source uses
# the regex literal ``\u200B`` which the JS engine interprets as the
# character. Python's ``re`` would also accept a ``\u200B`` escape in a
# raw-string pattern, but the compiled ``.pattern`` would then hold the
# six-character escape rather than the character itself, so we splice
# the literal code-point in via an f-string. This keeps our pattern
# functionally identical to the TS reference and lets
# ``"\u200B" in CITATION_REGEX.pattern`` succeed.
_ZWSP = "\u200B"
CITATION_REGEX = re.compile(
rf"[\[【]{_ZWSP}?citation:\s*("
rf"https?://[^\]】{_ZWSP}]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*"
rf")\s*{_ZWSP}?[\]】]"
)
@dataclass(frozen=True)
class ChunkCitation:
    """Citation of a chunk id; ``is_docs_chunk`` records a ``doc-`` prefix."""

    chunk_id: int
    is_docs_chunk: bool

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict tagged with ``"kind": "chunk"``."""
        payload: dict[str, Any] = {"kind": "chunk"}
        payload["chunk_id"] = self.chunk_id
        payload["is_docs_chunk"] = self.is_docs_chunk
        return payload
@dataclass(frozen=True)
class UrlCitation:
    """Citation that points at a URL."""

    url: str

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict tagged with ``"kind": "url"``."""
        payload: dict[str, Any] = {"kind": "url"}
        payload["url"] = self.url
        return payload
# Element type of the list returned by ``parse_citations``.
CitationToken = Union[ChunkCitation, UrlCitation]
def parse_citations(text: str, *, url_map: dict[str, str] | None = None) -> list[CitationToken]:
    """Collect every citation token in ``text``, in order of appearance.

    Args:
        text: Markdown / plain text that may contain ``[citation:...]``
            markers (ASCII or fullwidth brackets).
        url_map: Optional ``urlciteN -> URL`` lookup. When it is missing,
            empty, or lacks a given placeholder, that ``urlciteN`` token is
            dropped instead of being resolved to a bogus URL.

    Returns:
        A list of ``UrlCitation`` / ``ChunkCitation`` objects. A multi-id
        payload such as ``[citation:1, doc-2, -3]`` is flattened into one
        ``ChunkCitation`` per id.
    """
    tokens: list[CitationToken] = []
    for hit in CITATION_REGEX.finditer(text):
        payload = hit.group(1)

        if payload.startswith(("http://", "https://")):
            tokens.append(UrlCitation(url=payload.strip()))
        elif payload.startswith("urlcite"):
            # Placeholders resolve only through an explicit mapping.
            if url_map and payload in url_map:
                tokens.append(UrlCitation(url=url_map[payload]))
        else:
            for piece in payload.split(","):
                ident = piece.strip()
                docs_flag = ident.startswith("doc-")
                digits = ident[4:] if docs_flag else ident
                try:
                    parsed_id = int(digits)
                except ValueError:
                    # Skip anything that is not a plain integer.
                    continue
                tokens.append(ChunkCitation(chunk_id=parsed_id, is_docs_chunk=docs_flag))
    return tokens
__all__ = [
"CITATION_REGEX",
"ChunkCitation",
"UrlCitation",
"CitationToken",
"parse_citations",
]

View file

@ -0,0 +1,85 @@
"""Extract free-form answers from open-ended LLM responses.
Used by benchmarks that don't have a fixed letter set (MMLongBench-Doc,
DocVQA-style benchmarks, future legal/finance suites). The contract:
* Strip leading "Answer:" / "Final answer:" markers if present.
* Drop fenced code blocks if the model wrapped its answer in one.
* Trim leading/trailing whitespace.
* Return the *last* meaningful chunk, because models often think out
loud before stating the answer.
If the message is empty or only contains a fence, return ``""``.
"""
from __future__ import annotations
import re
# Anchored variant: matches an answer marker only at the start of a line.
_ANSWER_PREFIX = re.compile(
    r"^\s*(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
    re.IGNORECASE,
)
# Marker-only regex (no capture group) used to find every "Answer:"
# token position. We then slice from the LAST marker's end to the
# next newline ourselves — robust to multiple inline answers because
# we never let the engine greedy-capture across markers.
_ANSWER_MARKER = re.compile(
    r"(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
    re.IGNORECASE,
)
# Captures the body of a fenced code block, minus an optional language tag.
_FENCED_BLOCK = re.compile(r"```[a-zA-Z0-9]*\s*([\s\S]*?)\s*```")


def extract_freeform_answer(text: str) -> str:
    """Pull the model's final answer out of a possibly-verbose response.

    Resolution order (first non-empty result wins):

    1. The last non-empty line that *starts* with an answer marker
       ("Answer:", "Final answer:", "The answer is:" ...).
    2. The text after the last answer marker found anywhere, up to the
       next newline.
    3. The body of the last fenced code block if there is one, otherwise
       the last non-empty line.

    The result then has surrounding backticks removed and, when it is
    wrapped in one *matching* pair of quotes, those quotes removed.

    Args:
        text: Raw model response.

    Returns:
        The extracted answer, or ``""`` if ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return ""

    # 1. Walk backwards to the last line that starts with a marker.
    lines = [ln.rstrip() for ln in text.strip().splitlines()]
    candidate = ""
    for ln in reversed(lines):
        if not ln.strip():
            continue
        if _ANSWER_PREFIX.search(ln):
            candidate = _ANSWER_PREFIX.sub("", ln, count=1).strip()
            break

    if not candidate:
        # 2. Inline match: find every "Answer:" marker position and
        #    slice from the LAST marker's end to the next newline. Robust
        #    to "preamble.Answer: 42" one-liners and multiple inline
        #    markers (we always pick the final, freshest one).
        marker_matches = list(_ANSWER_MARKER.finditer(text))
        if marker_matches:
            last = marker_matches[-1]
            tail = text[last.end():]
            nl = tail.find("\n")
            if nl >= 0:
                tail = tail[:nl]
            candidate = tail.strip()

    if not candidate:
        # 3. No usable "Answer:" marker — try fenced blocks.
        fences = _FENCED_BLOCK.findall(text)
        if fences:
            candidate = fences[-1].strip()
        else:
            # Last non-empty line as a fallback.
            for ln in reversed(lines):
                if ln.strip():
                    candidate = ln.strip()
                    break

    # 4. Strip wrapping backticks, then one pair of wrapping quotes.
    #    The quotes must match and the string must hold at least the two
    #    quote characters; otherwise answers such as `'a' and "b"` or a
    #    lone `"` would be corrupted.
    candidate = candidate.strip().strip("`").strip()
    if len(candidate) >= 2 and candidate[0] == candidate[-1] and candidate[0] in "\"'":
        candidate = candidate[1:-1].strip()
    return candidate
__all__ = ["extract_freeform_answer"]

View file

@ -0,0 +1,72 @@
"""Minimal SSE consumer compatible with SurfSense's wire format.
SurfSense uses ``app/services/streaming/envelope/sse.py`` to frame events:
* ``data: <single-line-string>\\n\\n``
* ``data: <json-string>\\n\\n`` (most events)
* ``data: [DONE]\\n\\n`` (terminator)
There is no ``event:``, ``id:``, or ``retry:`` framing in production:
``format_sse(payload)`` only emits the ``data:`` line. This implementation
is therefore intentionally smaller than ``httpx-sse`` (which we still
list as a dep so callers who want richer parsing can opt in): one event
per ``data:`` line, separated by blank lines.
We accept any line iterator (an ``httpx.Response.aiter_lines`` adapter
in production, a list in tests) so this is unit-testable without a
network mock.
"""
from __future__ import annotations
from collections.abc import AsyncIterator
from dataclasses import dataclass
@dataclass(frozen=True)
class SseEvent:
    """A parsed SSE event. Only the ``data`` field is populated.

    Multi-line payloads (``data: a\\ndata: b``) are joined with ``\\n``
    per the SSE spec, even though SurfSense doesn't currently emit them.
    """

    data: str


async def iter_sse_events(lines: AsyncIterator[str]) -> AsyncIterator[SseEvent]:
    """Yield one ``SseEvent`` per blank-line-terminated frame.

    Each incoming line has its trailing ``\\r`` / ``\\n`` removed first, so
    the iterator may or may not already strip line endings. An empty line
    then flushes the buffered ``data:`` payloads as one event. ``data:``
    lines are accumulated; comment lines (leading ``:``) and every other
    field are ignored. A trailing frame without a terminating blank line
    is still emitted when the iterator ends.

    Args:
        lines: Async iterator of text lines (``None`` items are skipped).

    Yields:
        ``SseEvent`` objects in stream order.
    """
    buffer: list[str] = []
    async for raw in lines:
        if raw is None:
            continue
        # Strip both CR and LF: without this a source that keeps the "\n"
        # never produces an empty line, so no frame would ever flush.
        line = raw.rstrip("\r\n")
        if line == "":
            if buffer:
                yield SseEvent(data="\n".join(buffer))
                buffer.clear()
            continue
        if line.startswith(":"):
            # comment / heartbeat
            continue
        if line.startswith("data:"):
            # spec: optional single space after the colon.
            payload = line[5:]
            if payload.startswith(" "):
                payload = payload[1:]
            buffer.append(payload)
            continue
        # Any other field (event:, id:, retry:) is currently unused.
        continue
    if buffer:
        yield SseEvent(data="\n".join(buffer))
__all__ = ["SseEvent", "iter_sse_events"]

View file

@ -0,0 +1,35 @@
"""Direct parser invocations for the parser_compare benchmark.
The SurfSense backend exposes a single ``ETL_SERVICE`` env var that
picks one parser globally; per-ingestion overrides are not on the
public API. To drive the four (Azure DI x basic/premium, LlamaCloud x
basic/premium) extractions we need for ``multimodal_doc/parser_compare``
we therefore call the Azure DI and LlamaCloud SDKs directly from the
eval harness, mirroring the production code path in
``surfsense_backend/app/etl_pipeline/parsers/``.
Two design rules:
* No backend imports. The eval harness cannot pull in the FastAPI
app's config layer (it would require the full backend ``.env`` plus a
reachable Postgres). We re-read keys from our own environment instead.
* Same wire shape as the backend's parsers (Azure ``prebuilt-read`` /
``prebuilt-layout`` selected by ``processing_mode``; LlamaCloud
``parse_page_with_llm`` / ``parse_page_with_agent`` selected by
``processing_mode``) so any quality conclusions transfer back to
production behaviour.
"""
from __future__ import annotations
from .azure_di import AzureDIError, parse_with_azure_di
from .llamacloud import LlamaCloudError, parse_with_llamacloud
from .pdf_pages import count_pdf_pages
__all__ = [
"AzureDIError",
"LlamaCloudError",
"count_pdf_pages",
"parse_with_azure_di",
"parse_with_llamacloud",
]

View file

@ -0,0 +1,144 @@
"""Azure Document Intelligence parser — eval-side mirror of the backend.
Calls ``DocumentIntelligenceClient.begin_analyze_document`` with one
of two ``model_id`` slugs depending on ``processing_mode``:
* ``basic`` -> ``prebuilt-read`` (text OCR only, cheaper, faster)
* ``premium`` -> ``prebuilt-layout`` (text + tables + structure;
produces real markdown headings,
pipe-tables, etc.)
These are the same model selections the production
``surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py``
makes per ``processing_mode``. Output format is forced to Markdown
(``DocumentContentFormat.MARKDOWN``) so the long-context arm can stuff
it into a prompt verbatim.
Retry policy is intentionally light here (the eval harness re-runs
the whole batch on top-level failure): up to ``_MAX_RETRIES`` attempts,
with exponential backoff and jitter between transient failures.
"""
from __future__ import annotations
import asyncio
import logging
import os
import random
logger = logging.getLogger(__name__)
# Maps the harness ``processing_mode`` to the Azure DI ``model_id``.
_AZURE_MODEL_BY_MODE = {
    "basic": "prebuilt-read",
    "premium": "prebuilt-layout",
}
# Total number of attempts made by ``parse_with_azure_di`` (not extra retries).
_MAX_RETRIES = 4
# Backoff after attempt N is ``_BASE_DELAY * 2 ** (N - 1)`` seconds, capped at
# ``_MAX_DELAY`` and then jittered by up to +/-25%.
_BASE_DELAY = 5.0
_MAX_DELAY = 60.0
class AzureDIError(RuntimeError):
    """Raised when Azure DI parsing fails.

    ``parse_with_azure_di`` raises it for a non-retryable HTTP error, for
    an empty result, and once every retry attempt has been used up.
    """
async def parse_with_azure_di(
    file_path: str | os.PathLike,
    *,
    processing_mode: str = "basic",
    endpoint: str | None = None,
    api_key: str | None = None,
) -> str:
    """Run Azure DI on ``file_path`` and return the markdown content.

    Args:
        file_path: Document to analyse; opened in binary mode and uploaded.
        processing_mode: ``"basic"`` or ``"premium"``; looked up in
            ``_AZURE_MODEL_BY_MODE``. Unknown values fall back to
            ``prebuilt-read``.
        endpoint: Azure DI endpoint; defaults to ``AZURE_DI_ENDPOINT``.
        api_key: Azure DI key; defaults to ``AZURE_DI_KEY``.

    Returns:
        The stripped, non-empty markdown content of the analysis result.

    Raises:
        ValueError: If the endpoint or the key is missing.
        AzureDIError: On a non-retryable 4xx response, on empty content,
            or after ``_MAX_RETRIES`` failed attempts.
        azure.core.exceptions.ClientAuthenticationError: Re-raised as-is,
            never retried.
    """
    endpoint = endpoint or os.environ.get("AZURE_DI_ENDPOINT")
    api_key = api_key or os.environ.get("AZURE_DI_KEY")
    if not endpoint or not api_key:
        raise ValueError(
            "AZURE_DI_ENDPOINT and AZURE_DI_KEY must be set "
            "(see surfsense_evals/.env)."
        )
    model_id = _AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")

    # Lazy imports — surfsense_evals shouldn't pay the azure-sdk
    # import cost on every CLI invocation that doesn't touch
    # parser_compare.
    from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import DocumentContentFormat
    from azure.core.credentials import AzureKeyCredential
    from azure.core.exceptions import (
        ClientAuthenticationError,
        HttpResponseError,
        ServiceRequestError,
        ServiceResponseError,
    )

    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    logger.info(
        "Azure DI parsing %s (mode=%s, model=%s, size=%.1fMB)",
        file_path, processing_mode, model_id, file_size_mb,
    )

    # 408 (request timeout) and 429 (rate limited) are 4xx codes, but they
    # describe a transient condition rather than a broken request.
    transient_4xx = (408, 429)

    last_exc: Exception | None = None
    for attempt in range(1, _MAX_RETRIES + 1):
        try:
            client = DocumentIntelligenceClient(
                endpoint=endpoint,
                credential=AzureKeyCredential(api_key),
            )
            async with client:
                with open(file_path, "rb") as fh:
                    poller = await client.begin_analyze_document(
                        model_id,
                        body=fh,
                        output_content_format=DocumentContentFormat.MARKDOWN,
                    )
                result = await poller.result()
            content = (result.content or "").strip()
            if not content:
                # Not caught below, so an empty result is never retried.
                raise AzureDIError(
                    f"Azure DI returned empty content for {file_path}"
                )
            logger.info(
                "Azure DI OK: %s (%s) -> %d chars",
                file_path, model_id, len(content),
            )
            return content
        except ClientAuthenticationError:
            # Bad credentials will not fix themselves; surface immediately.
            raise
        except HttpResponseError as exc:
            status = exc.status_code
            # Any other 4xx: don't retry, the request itself is broken.
            if status and 400 <= status < 500 and status not in transient_4xx:
                raise AzureDIError(
                    f"Azure DI {exc.status_code} on {file_path}: {exc}"
                ) from exc
            last_exc = exc
        except (ServiceRequestError, ServiceResponseError) as exc:
            last_exc = exc

        if attempt < _MAX_RETRIES:
            # Exponential backoff capped at _MAX_DELAY, with +/-25% jitter.
            delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
            jitter = delay * 0.25 * (2 * random.random() - 1)
            sleep_for = delay + jitter
            logger.warning(
                "Azure DI attempt %d/%d failed (%s); retrying in %.1fs",
                attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
            )
            await asyncio.sleep(sleep_for)

    raise AzureDIError(
        f"Azure DI failed after {_MAX_RETRIES} attempts on {file_path}"
    ) from last_exc
__all__ = ["AzureDIError", "parse_with_azure_di"]

View file

@ -0,0 +1,168 @@
"""LlamaParse (LlamaCloud) parser — eval-side mirror of the backend.
Calls ``LlamaParse.aparse`` with one of two ``parse_mode`` slugs
depending on ``processing_mode``:
* ``basic`` -> ``parse_page_with_llm`` (cheap, single-LLM-call/page)
* ``premium`` -> ``parse_page_with_agent`` (multi-step agent per page;
handles tables / figures
substantially better)
These are the exact mappings from production
``surfsense_backend/app/etl_pipeline/parsers/llamacloud.py``. We keep
``num_workers=1`` and language=``"en"`` to match production.
The result is materialised via ``get_markdown_documents(split_by_page=False)``
which concatenates every page into a single markdown string, exactly
the shape we need for long-context stuffing.
"""
from __future__ import annotations
import asyncio
import logging
import os
import random
import httpx
logger = logging.getLogger(__name__)
# Maps the harness ``processing_mode`` to the LlamaParse ``parse_mode``.
_LLAMA_PARSE_MODE_MAP = {
    "basic": "parse_page_with_llm",
    "premium": "parse_page_with_agent",
}
# Total number of attempts made by ``parse_with_llamacloud``.
_MAX_RETRIES = 3
# Backoff after attempt N is ``_BASE_DELAY * 2 ** (N - 1)`` seconds, capped at
# ``_MAX_DELAY`` and then jittered by up to +/-25%.
_BASE_DELAY = 10.0
_MAX_DELAY = 90.0
class LlamaCloudError(RuntimeError):
    """Raised when LlamaCloud parse fails after all retries.

    Also raised by ``parse_with_llamacloud`` for an empty result. Because
    this is a ``RuntimeError`` subclass, that function's retry handler
    catches it and the empty-result case is retried too.
    """
def _extract_markdown(result) -> str:
"""Pull markdown out of whatever object LlamaParse.aparse returns.
Mirrors backend's tolerant extraction: the SDK has gone through
several response shapes; we accept all of them so a minor SDK bump
doesn't silently zero the eval.
"""
if hasattr(result, "get_markdown_documents"):
docs = result.get_markdown_documents(split_by_page=False)
if docs and hasattr(docs[0], "text"):
return docs[0].text
if hasattr(result, "pages") and result.pages:
return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
if isinstance(result, list):
if result and hasattr(result[0], "text"):
return result[0].text
return "\n\n".join(
doc.page_content if hasattr(doc, "page_content") else str(doc)
for doc in result
)
return str(result)
async def parse_with_llamacloud(
    file_path: str | os.PathLike,
    *,
    processing_mode: str = "basic",
    estimated_pages: int = 50,
    api_key: str | None = None,
) -> str:
    """Run LlamaParse on ``file_path`` and return the markdown content.

    Args:
        file_path: Document to parse; passed to ``LlamaParse.aparse`` as a
            string path.
        processing_mode: ``"basic"`` or ``"premium"``; looked up in
            ``_LLAMA_PARSE_MODE_MAP``. Unknown values fall back to
            ``parse_page_with_llm``.
        estimated_pages: Page count used only to size the job timeout.
        api_key: Defaults to the ``LLAMA_CLOUD_API_KEY`` env var (set
            in ``surfsense_evals/.env``).

    Returns:
        The stripped, non-empty markdown content.

    Raises:
        ValueError: If the API key is missing.
        LlamaCloudError: After ``_MAX_RETRIES`` failed attempts. An empty
            result counts as a failed attempt and is retried, because the
            ``LlamaCloudError`` raised for it is itself a ``RuntimeError``.
    """
    api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY")
    if not api_key:
        raise ValueError(
            "LLAMA_CLOUD_API_KEY must be set (see surfsense_evals/.env)."
        )
    parse_mode = _LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")

    # Lazy import: llama-cloud pulls llama-index-core (~50 MB) on first
    # touch; defer until the parser actually runs.
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.base import JobFailedException
    from llama_cloud_services.parse.utils import ResultType

    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    # Match backend's per-page timeout heuristic so big PDFs don't drop
    # mid-job: 60s baseline + 30s/page (premium agent runs longer than
    # basic; both fit comfortably here). Never below 180s.
    job_timeout = max(180.0, 60.0 + 30.0 * estimated_pages)
    # Read/write timeout scales with file size: 30s per MB, at least 120s.
    upload_timeout = max(120.0, 30.0 * file_size_mb)
    logger.info(
        "LlamaCloud parsing %s (mode=%s, parse_mode=%s, %.1fMB, "
        "job_timeout=%.0fs)",
        file_path, processing_mode, parse_mode, file_size_mb, job_timeout,
    )
    custom_timeout = httpx.Timeout(
        connect=120.0, read=upload_timeout, write=upload_timeout, pool=120.0,
    )

    last_exc: Exception | None = None
    for attempt in range(1, _MAX_RETRIES + 1):
        try:
            # A fresh HTTP client (and parser) is built for every attempt.
            async with httpx.AsyncClient(timeout=custom_timeout) as client:
                parser = LlamaParse(
                    api_key=api_key,
                    num_workers=1,
                    verbose=False,
                    language="en",
                    result_type=ResultType.MD,
                    parse_mode=parse_mode,
                    ignore_errors=False,
                    max_timeout=int(max(2000.0, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=60,
                    custom_client=client,
                )
                result = await parser.aparse(str(file_path))
            content = _extract_markdown(result).strip()
            if not content:
                # Caught by the ``RuntimeError`` handler below, so an empty
                # result is treated like any other failed attempt.
                raise LlamaCloudError(
                    f"LlamaCloud returned empty content for {file_path}"
                )
            logger.info(
                "LlamaCloud OK: %s (%s) -> %d chars",
                file_path, parse_mode, len(content),
            )
            return content
        except (
            httpx.HTTPError,
            JobFailedException,
            RuntimeError,
        ) as exc:
            last_exc = exc

        if attempt < _MAX_RETRIES:
            # Exponential backoff capped at _MAX_DELAY, with +/-25% jitter.
            delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
            jitter = delay * 0.25 * (2 * random.random() - 1)
            sleep_for = delay + jitter
            logger.warning(
                "LlamaCloud attempt %d/%d failed (%s); retrying in %.1fs",
                attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
            )
            await asyncio.sleep(sleep_for)

    raise LlamaCloudError(
        f"LlamaCloud failed after {_MAX_RETRIES} attempts on {file_path}"
    ) from last_exc
__all__ = ["LlamaCloudError", "parse_with_llamacloud"]

View file

@ -0,0 +1,35 @@
"""Tiny pypdf wrapper for "how many pages does this PDF have?".
Used by ``parser_compare`` to:
* Decide LlamaCloud's per-page job timeout.
* Compute the SurfSense preprocessing dollar cost
(``$1 / 1k pages`` for basic, ``$10 / 1k pages`` for premium) so the
report can show "ingest + LLM" total cost per arm.
Returns ``0`` (and logs) on parse failure rather than raising: costs
shown as ``?`` are always better than a benchmark that crashes mid-run.
"""
from __future__ import annotations
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
def count_pdf_pages(path: Path) -> int:
    """Count the pages of the PDF at ``path``.

    Best-effort: any failure (pypdf missing, unreadable or malformed
    file, ...) is logged as a warning and reported as ``0`` instead of
    being raised.
    """
    try:
        from pypdf import PdfReader

        page_total = len(PdfReader(str(path)).pages)
    except Exception as exc:  # noqa: BLE001
        logger.warning("Failed to count pages for %s: %s", path, exc)
        return 0
    return page_total
__all__ = ["count_pdf_pages"]

View file

@ -0,0 +1,31 @@
"""Domain-agnostic PDF rendering helper. Lazy import."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING: # pragma: no cover
from .render import (
PdfImage,
render_pdf,
render_pdf_with_images,
render_text_files_to_pdf,
)
__all__ = [
"PdfImage",
"render_pdf",
"render_pdf_with_images",
"render_text_files_to_pdf",
]
# Public names that are resolved from ``.render`` on first attribute access.
_LAZY = {"PdfImage", "render_pdf", "render_pdf_with_images", "render_text_files_to_pdf"}


def __getattr__(name: str):
    """Module-level attribute hook that imports ``.render`` on demand.

    Names listed in ``_LAZY`` are fetched from the ``render`` submodule;
    every other name raises ``AttributeError``.
    """
    if name not in _LAZY:
        raise AttributeError(f"module 'surfsense_evals.core.pdf' has no attribute {name!r}")
    from . import render as _mod

    return getattr(_mod, name)

View file

@ -0,0 +1,351 @@
"""Deterministic ``.txt`` / ``.md`` → single PDF via reportlab.
Used wherever a benchmark needs the same source bytes fed to both the
native-PDF arm and the SurfSense ingestion arm. The head-to-head
comparison is fair only if the *same* PDF is the input to both arms,
which is why we go to lengths to make the rendering deterministic.
Determinism notes:
* reportlab stamps each render with the current ``CreationDate`` /
``ModDate`` (plus a per-run trailer ``/ID``), so two renders of the
same input otherwise differ only in those bytes. When
``deterministic=True`` is passed we post-process the PDF and scrub
them.
* Page size, font, margins, and tab handling are fixed in code so the
same input yields the same byte output across machines.
* PDF/A is overkill for our use; basic PDF 1.4 is what every model
expects.
"""
from __future__ import annotations
import io
import re
from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.platypus import (
Image,
KeepTogether,
PageBreak,
Paragraph,
SimpleDocTemplate,
Spacer,
)
@dataclass
class RenderedPdf:
    """Summary returned by the ``render_pdf*`` helpers after writing a PDF."""

    # Where the PDF bytes were written.
    path: Path
    # Heuristic page count (chars // 3000 + number of sections), not the
    # number of pages reportlab actually produced.
    n_pages_estimate: int
    # Total characters of body-paragraph text (headings/captions excluded).
    n_chars: int
_PDF_DATE_KEY = re.compile(rb"/(?:CreationDate|ModDate)\s*\(D:[^)]*\)")
# reportlab also writes a `/ID [<hex1><hex2>]` trailer entry that
# embeds a per-run hash. Scrub it so two renders of the same input
# produce the same bytes.
_PDF_ID_ARRAY = re.compile(rb"/ID\s*\[\s*<[^>]*>\s*<[^>]*>\s*\]")
def _scrub_dates(pdf_bytes: bytes) -> bytes:
"""Remove ``CreationDate`` / ``ModDate`` / trailer ``/ID`` so the
file is byte-deterministic across runs."""
pdf_bytes = _PDF_DATE_KEY.sub(b"/CreationDate (D:19700101000000Z)", pdf_bytes)
pdf_bytes = _PDF_ID_ARRAY.sub(b"/ID [<00><00>]", pdf_bytes)
return pdf_bytes
# reportlab's sample stylesheet, loaded once at import; the Eval* styles
# below use its entries as parents.
_DEFAULT_STYLES = getSampleStyleSheet()
def _build_body_style() -> ParagraphStyle:
    """Create the ``EvalBody`` style: 10.5pt Helvetica on 14pt leading.

    Derived from the sample stylesheet's ``BodyText`` entry.
    """
    return ParagraphStyle(
        "EvalBody",
        parent=_DEFAULT_STYLES["BodyText"],
        fontName="Helvetica",
        fontSize=10.5,
        leading=14,
        spaceAfter=6,
        spaceBefore=0,
    )
def _build_heading_style() -> ParagraphStyle:
    """Create the ``EvalHeading`` style: 14pt Helvetica-Bold on 18pt leading.

    Derived from the sample stylesheet's ``Heading2`` entry.
    """
    return ParagraphStyle(
        "EvalHeading",
        parent=_DEFAULT_STYLES["Heading2"],
        fontName="Helvetica-Bold",
        fontSize=14,
        leading=18,
        spaceAfter=10,
        spaceBefore=8,
    )
def _normalise_paragraphs(text: str) -> list[str]:
"""Split a text blob into paragraphs while preserving blank-line structure."""
blocks: list[list[str]] = [[]]
for line in text.splitlines():
stripped = line.rstrip()
if stripped == "":
if blocks[-1]:
blocks.append([])
continue
blocks[-1].append(stripped)
paragraphs: list[str] = []
for block in blocks:
if not block:
continue
# Join lines within a paragraph with spaces (text-from-PDF style).
paragraphs.append(" ".join(block))
return paragraphs
def _escape_html(text: str) -> str:
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def render_pdf(
    *,
    title: str,
    sections: Sequence[tuple[str | None, str]],
    output_path: Path,
    deterministic: bool = True,
) -> RenderedPdf:
    """Render one PDF from a list of ``(section_heading, section_text)`` tuples.

    ``section_heading`` may be ``None`` for an unnamed section. A page
    break is inserted before every section except the first, so the
    model's PDF parser sees a clean structural boundary between source
    files.

    Args:
        title: Document title; rendered as the first paragraph and set as
            PDF metadata.
        sections: ``(heading, text)`` pairs; ``text`` is split into
            paragraphs at blank lines.
        output_path: Destination file; parent directories are created.
        deterministic: When true, the volatile metadata is scrubbed via
            ``_scrub_dates`` before writing.

    Returns:
        A ``RenderedPdf`` holding the output path, an estimated page
        count, and the number of body characters rendered.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build in memory so the bytes can be post-processed before writing.
    buffer = io.BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=LETTER,
        leftMargin=0.75 * inch,
        rightMargin=0.75 * inch,
        topMargin=0.75 * inch,
        bottomMargin=0.75 * inch,
        title=title,
        author="surfsense-evals",
        subject="Eval input",
        creator="surfsense-evals",
    )
    body_style = _build_body_style()
    heading_style = _build_heading_style()
    title_style = ParagraphStyle(
        "EvalTitle",
        parent=_DEFAULT_STYLES["Title"],
        fontName="Helvetica-Bold",
        fontSize=18,
        leading=22,
        spaceAfter=14,
    )

    # All user-supplied text is HTML-escaped before it reaches Paragraph.
    flow: list = [Paragraph(_escape_html(title), title_style)]
    total_chars = 0
    for index, (heading, text) in enumerate(sections):
        if index > 0:
            flow.append(PageBreak())
        if heading:
            flow.append(Paragraph(_escape_html(heading), heading_style))
        for paragraph in _normalise_paragraphs(text):
            total_chars += len(paragraph)
            flow.append(Paragraph(_escape_html(paragraph), body_style))
        flow.append(Spacer(1, 4))

    doc.build(flow)
    pdf_bytes = buffer.getvalue()
    if deterministic:
        pdf_bytes = _scrub_dates(pdf_bytes)
    output_path.write_bytes(pdf_bytes)

    # Conservative page estimate: ~3000 chars per LETTER page at 10.5pt.
    n_pages = max(1, total_chars // 3000 + len(sections))
    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
@dataclass
class PdfImage:
    """One image to embed inside a section.

    ``caption`` is rendered below the image in a smaller caption style.
    ``max_width_in`` caps the rendered width in inches; height
    auto-scales to preserve aspect ratio (dimensions are read through
    reportlab's ``ImageReader``).
    """

    path: Path
    caption: str = ""
    max_width_in: float = 5.5  # default leaves margin for LETTER 8.5"
def _make_image_flowable(image: PdfImage) -> Image:
    """Wrap ``image`` in a reportlab ``Image`` sized to fit the page.

    The width is set to ``image.max_width_in`` inches and the height
    follows the aspect ratio. If that height would exceed 7 inches, the
    height is pinned to 7 inches and the width is derived from it.

    Raises:
        ValueError: If the image reports a non-positive width or height.
    """
    source = str(image.path)
    px_w, px_h = ImageReader(source).getSize()
    if px_w <= 0 or px_h <= 0:
        raise ValueError(f"Invalid image dimensions for {image.path}: {px_w}x{px_h}")

    # Cap height too — some medical images are extreme portrait.
    height_cap = 7.0 * inch
    width_pt = image.max_width_in * inch
    height_pt = width_pt * (px_h / px_w)
    if height_pt > height_cap:
        height_pt = height_cap
        width_pt = height_pt * (px_w / px_h)
    return Image(source, width=width_pt, height=height_pt)
def render_pdf_with_images(
    *,
    title: str,
    sections: Sequence[tuple[str | None, str, Sequence[PdfImage] | None]],
    output_path: Path,
    deterministic: bool = True,
    page_break_between_sections: bool = False,
) -> RenderedPdf:
    """Render a PDF that mixes text and embedded images.

    Each section is ``(heading, body_text, images)``. Images render
    inline after the body text, each followed by its caption in a
    smaller font (or by a small spacer when the caption is empty).
    Set ``page_break_between_sections=True`` if you want explicit
    structural boundaries (mostly useful for multi-case PDFs); the
    default keeps everything on one page when possible (so a single
    MedXpertQA case is one PDF page with case + images + options).

    An image that cannot be turned into a flowable is skipped silently;
    it never aborts the render.

    Args:
        title: Document title; rendered first and set as PDF metadata.
        sections: ``(heading, text, images)`` triples; ``heading`` and
            ``images`` may be ``None``.
        output_path: Destination file; parent directories are created.
        deterministic: When true, the volatile metadata is scrubbed via
            ``_scrub_dates`` before writing.
        page_break_between_sections: Insert a page break before every
            section except the first.

    Returns:
        A ``RenderedPdf`` holding the output path, an estimated page
        count, and the number of body characters rendered.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build in memory so the bytes can be post-processed before writing.
    buffer = io.BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=LETTER,
        leftMargin=0.75 * inch,
        rightMargin=0.75 * inch,
        topMargin=0.75 * inch,
        bottomMargin=0.75 * inch,
        title=title,
        author="surfsense-evals",
        subject="Eval input",
        creator="surfsense-evals",
    )
    body_style = _build_body_style()
    heading_style = _build_heading_style()
    # NOTE(review): textColor is given as the 3-digit string "#444" —
    # confirm reportlab expands this shorthand the way CSS does.
    caption_style = ParagraphStyle(
        "EvalCaption",
        parent=body_style,
        fontSize=9,
        leading=11,
        textColor="#444",
        spaceBefore=2,
        spaceAfter=10,
    )
    title_style = ParagraphStyle(
        "EvalTitle",
        parent=_DEFAULT_STYLES["Title"],
        fontName="Helvetica-Bold",
        fontSize=18,
        leading=22,
        spaceAfter=14,
    )

    # All user-supplied text is HTML-escaped before it reaches Paragraph.
    flow: list = [Paragraph(_escape_html(title), title_style)]
    total_chars = 0
    for index, (heading, text, images) in enumerate(sections):
        if index > 0 and page_break_between_sections:
            flow.append(PageBreak())
        if heading:
            flow.append(Paragraph(_escape_html(heading), heading_style))
        for paragraph in _normalise_paragraphs(text):
            total_chars += len(paragraph)
            flow.append(Paragraph(_escape_html(paragraph), body_style))
        flow.append(Spacer(1, 4))
        for image in images or []:
            try:
                img_flow = _make_image_flowable(image)
            except Exception:  # noqa: BLE001 — bad image shouldn't kill PDF
                continue
            # KeepTogether groups the image with its caption / spacer.
            grouped = [img_flow]
            if image.caption:
                grouped.append(Paragraph(_escape_html(image.caption), caption_style))
            else:
                grouped.append(Spacer(1, 8))
            flow.append(KeepTogether(grouped))

    doc.build(flow)
    pdf_bytes = buffer.getvalue()
    if deterministic:
        pdf_bytes = _scrub_dates(pdf_bytes)
    output_path.write_bytes(pdf_bytes)

    # Same heuristic as ``render_pdf``; images do not count towards it.
    n_pages = max(1, total_chars // 3000 + len(sections))
    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
def render_text_files_to_pdf(
    *,
    title: str,
    files: Iterable[Path],
    output_path: Path,
    deterministic: bool = True,
) -> RenderedPdf:
    """Read text files and render them as one PDF, one section per file.

    Each file is decoded as UTF-8 and becomes a section whose heading is
    the file name without its extension (``admission_note.txt`` gives the
    heading ``admission_note``). Sections keep the order of ``files``.
    Handy for text-only benchmarks that ship a corpus as separate
    ``.txt`` / ``.md`` shards per logical document.
    """
    sections: list[tuple[str | None, str]] = [
        (source.stem, source.read_text(encoding="utf-8"))
        for source in map(Path, files)
    ]
    return render_pdf(
        title=title,
        sections=sections,
        output_path=output_path,
        deterministic=deterministic,
    )
# Tiny self-check — handy when debugging.
def _self_test() -> None:  # pragma: no cover
    """Render a two-section sample PDF into the working directory and report it."""
    rendered = render_pdf(
        title="Self test",
        sections=[
            ("intro", "Hello world.\n\nThis is a test."),
            ("body", "Line one.\nLine two."),
        ],
        output_path=Path("./_render_self_test.pdf"),
    )
    print(f"wrote {rendered.path} ({rendered.n_chars} chars)")
# Importing ``datetime`` keeps the timezone helper handy if a future
# benchmark wants to embed a real timestamp without losing determinism.
_NOW_FROZEN = datetime(2026, 5, 11, tzinfo=UTC)

View file

@ -0,0 +1,22 @@
"""External LLM providers (used by the native arm).
Lazy imports so the SurfSense-only path doesn't transitively load the
OpenRouter client until something actually constructs ``OpenRouterPdfProvider``.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING: # pragma: no cover
from .openrouter_pdf import OpenRouterPdfProvider, OpenRouterResponse
__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse"]
def __getattr__(name: str):
    """Module-level attribute hook that imports ``.openrouter_pdf`` on demand.

    Only ``OpenRouterPdfProvider`` and ``OpenRouterResponse`` are
    resolved; every other name raises ``AttributeError``.
    """
    if name not in {"OpenRouterPdfProvider", "OpenRouterResponse"}:
        raise AttributeError(f"module 'surfsense_evals.core.providers' has no attribute {name!r}")
    from . import openrouter_pdf as _mod

    return getattr(_mod, name)

View file

@ -0,0 +1,118 @@
"""Bare OpenRouter ``chat/completions`` provider — no PDF, no plugins.
Used by ``BareLlmArm`` to measure "what does the model answer with
zero retrieval context?". Same wire shape as ``OpenRouterPdfProvider``
minus the file-parser plugin and the ``file`` content part:
```json
{
"model": "openai/gpt-5.4-mini",
"messages": [
{"role": "system", "content": "<optional>"},
{"role": "user", "content": "<prompt>"}
]
}
```
The response shape is identical to the PDF provider's, so we re-use
``_parse_chat_completion`` from ``openrouter_pdf`` and only specialise
the request builder. That keeps cost-extraction, token-counting, and
content-array handling in one place.
"""
from __future__ import annotations
import logging
import time
from typing import Any
import httpx
from .openrouter_pdf import (
OpenRouterResponse,
_DEFAULT_HEADERS,
_parse_chat_completion,
)
logger = logging.getLogger(__name__)
class OpenRouterChatProvider:
    """Stateless bare-chat client. No PDF, no file-parser plugin."""

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str = "https://openrouter.ai/api/v1",
        model: str,
        timeout_s: float = 600.0,
    ) -> None:
        """Store the connection settings.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("OPENROUTER_API_KEY is required for the bare-LLM arm.")
        self._api_key = api_key
        # Drop trailing slashes so the endpoint can be appended as "/...".
        self._base = base_url.rstrip("/")
        self._model = model
        self._timeout = httpx.Timeout(timeout_s, connect=15.0)

    @property
    def model(self) -> str:
        """Model slug sent with every request."""
        return self._model

    def _build_payload(
        self,
        *,
        prompt: str,
        system_prompt: str | None,
        max_tokens: int | None,
    ) -> dict[str, Any]:
        """Assemble the JSON body for ``chat/completions``.

        The system message is included only when ``system_prompt`` is
        truthy, and ``max_tokens`` only when it is truthy (so ``0`` and
        ``None`` both omit it).
        """
        system_part: list[dict[str, Any]] = (
            [{"role": "system", "content": system_prompt}] if system_prompt else []
        )
        conversation = system_part + [{"role": "user", "content": prompt}]
        request_body: dict[str, Any] = {"model": self._model, "messages": conversation}
        if max_tokens:
            request_body["max_tokens"] = max_tokens
        return request_body

    async def complete(
        self,
        *,
        prompt: str,
        system_prompt: str | None = None,
        max_tokens: int | None = None,
        http: httpx.AsyncClient | None = None,
    ) -> OpenRouterResponse:
        """Run one chat completion and return the parsed response.

        Uses the caller's ``http`` client when given; otherwise opens a
        temporary client for this single request. No retries happen here.

        Raises:
            httpx.HTTPStatusError: For any response with status >= 400;
                the message carries the first 300 characters of the body.
        """
        endpoint = f"{self._base}/chat/completions"
        request_body = self._build_payload(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=max_tokens,
        )
        request_headers = {
            "Authorization": f"Bearer {self._api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
            **_DEFAULT_HEADERS,
        }

        clock_start = time.monotonic()
        if http is None:
            async with httpx.AsyncClient(timeout=self._timeout) as client:
                response = await client.post(
                    endpoint, json=request_body, headers=request_headers, timeout=self._timeout
                )
        else:
            response = await http.post(
                endpoint, json=request_body, headers=request_headers, timeout=self._timeout
            )
        latency_ms = int((time.monotonic() - clock_start) * 1000)

        if response.status_code >= 400:
            raise httpx.HTTPStatusError(
                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
                request=response.request,
                response=response,
            )
        return _parse_chat_completion(response.json(), latency_ms=latency_ms)
__all__ = ["OpenRouterChatProvider"]

View file

@ -0,0 +1,231 @@
"""Native-PDF arm provider: OpenRouter ``chat/completions`` with PDF input.
Per `<https://openrouter.ai/docs/features/multimodal/pdfs>`__ the wire
shape is OpenAI-compatible with one PDF-specific extra:
```json
{
"model": "anthropic/claude-sonnet-4.5",
"messages": [{
"role": "user",
"content": [
{"type": "file", "file": {"filename": "case.pdf",
"file_data": "data:application/pdf;base64,<b64>"}},
{"type": "text", "text": "<prompt>"}
]
}],
"plugins": [{"id": "file-parser", "pdf": {"engine": "native"}}]
}
```
``engine: "native"`` is the only engine that doesn't pre-OCR the
PDF: it forwards raw bytes to PDF-native models (Claude, Gemini),
matching what a human user does when "dropping the PDF into Claude".
``mistral-ocr`` and ``cloudflare-ai`` are exposed as enum options for
non-native models.
Headers ``HTTP-Referer`` and ``X-Title`` make spend show up cleanly on
the OpenRouter dashboard.
"""
from __future__ import annotations
import base64
import logging
import time
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any
import httpx
logger = logging.getLogger(__name__)
class PdfEngine(str, Enum):
    """Engines accepted by the ``file-parser`` plugin's ``pdf.engine`` field.

    Members also subclass ``str``, so each one compares equal to its wire
    value and can be looked up from it (``PdfEngine("native")``).
    """

    # Forwards the raw PDF bytes to a PDF-native model without pre-OCR.
    NATIVE = "native"
    # Options exposed for models without native PDF input.
    MISTRAL_OCR = "mistral-ocr"
    CLOUDFLARE_AI = "cloudflare-ai"
@dataclass
class OpenRouterResponse:
"""Subset of the OpenRouter response we care about for scoring."""
text: str
input_tokens: int
output_tokens: int
total_tokens: int
cost_micros: int
latency_ms: int
finish_reason: str | None
raw: dict[str, Any]
# Attribution headers merged into the headers of every request built in this
# module; per the module docstring they make spend show up cleanly on the
# OpenRouter dashboard.
_DEFAULT_HEADERS = {
    "HTTP-Referer": "https://github.com/MODSetter/SurfSense",
    "X-Title": "SurfSense-evals",
}
class OpenRouterPdfProvider:
    """Small httpx client for OpenRouter chat completions with a PDF attached.

    An instance stores configuration only (key, base URL, model, engine,
    timeout), so the same object can serve every request an arm makes.
    """

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str = "https://openrouter.ai/api/v1",
        model: str,
        engine: PdfEngine = PdfEngine.NATIVE,
        timeout_s: float = 600.0,
    ) -> None:
        """Store the connection settings.

        Args:
            api_key: OpenRouter API key; must be non-empty.
            base_url: API root; a trailing ``/`` is removed.
            model: Model slug sent as ``model`` in each request.
            engine: PDF engine passed to the ``file-parser`` plugin.
            timeout_s: Overall request timeout in seconds (connect is
                capped at 15 s).

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("OPENROUTER_API_KEY is required for the native arm.")
        self._api_key = api_key
        self._base = base_url.rstrip("/")
        self._model = model
        self._engine = engine
        self._timeout = httpx.Timeout(timeout_s, connect=15.0)

    @property
    def model(self) -> str:
        """Model slug this provider sends requests for."""
        return self._model

    @property
    def engine(self) -> PdfEngine:
        """PDF engine this provider asks the ``file-parser`` plugin to use."""
        return self._engine

    def _build_payload(
        self,
        *,
        prompt: str,
        pdf_path: Path,
        max_tokens: int | None,
        extra_messages: list[dict[str, Any]] | None,
    ) -> dict[str, Any]:
        """Assemble the JSON request body.

        The PDF at ``pdf_path`` is read from disk and embedded as a base64
        ``data:`` URL in a ``file`` content part, followed by the text
        prompt. ``extra_messages`` (if any) are placed before the user
        turn. ``max_tokens`` is included only when truthy.
        """
        encoded_pdf = base64.b64encode(pdf_path.read_bytes()).decode("ascii")
        pdf_part: dict[str, Any] = {
            "type": "file",
            "file": {
                "filename": pdf_path.name,
                "file_data": f"data:application/pdf;base64,{encoded_pdf}",
            },
        }
        prompt_part: dict[str, Any] = {"type": "text", "text": prompt}
        user_turn: dict[str, Any] = {
            "role": "user",
            "content": [pdf_part, prompt_part],
        }
        conversation: list[dict[str, Any]] = [*(extra_messages or []), user_turn]
        request_body: dict[str, Any] = {
            "model": self._model,
            "messages": conversation,
            "plugins": [
                {"id": "file-parser", "pdf": {"engine": self._engine.value}}
            ],
        }
        if max_tokens:
            request_body["max_tokens"] = max_tokens
        return request_body

    async def complete(
        self,
        *,
        prompt: str,
        pdf_path: Path,
        max_tokens: int | None = None,
        extra_messages: list[dict[str, Any]] | None = None,
        http: httpx.AsyncClient | None = None,
    ) -> OpenRouterResponse:
        """Run one chat completion; failures propagate so the runner can retry.

        Args:
            prompt: Text placed after the PDF in the user turn.
            pdf_path: PDF file to attach.
            max_tokens: Optional completion cap.
            extra_messages: Optional messages placed before the user turn.
            http: Client to post with; when ``None`` a temporary client is
                opened for this single call.

        Raises:
            httpx.HTTPStatusError: For any response with status >= 400.
        """
        request_body = self._build_payload(
            prompt=prompt,
            pdf_path=pdf_path,
            max_tokens=max_tokens,
            extra_messages=extra_messages,
        )
        request_headers = {
            "Authorization": f"Bearer {self._api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
            **_DEFAULT_HEADERS,
        }
        endpoint = f"{self._base}/chat/completions"

        t0 = time.monotonic()
        if http is None:
            async with httpx.AsyncClient(timeout=self._timeout) as client:
                response = await client.post(
                    endpoint,
                    json=request_body,
                    headers=request_headers,
                    timeout=self._timeout,
                )
        else:
            response = await http.post(
                endpoint,
                json=request_body,
                headers=request_headers,
                timeout=self._timeout,
            )
        latency_ms = int((time.monotonic() - t0) * 1000)

        if response.status_code >= 400:
            raise httpx.HTTPStatusError(
                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
                request=response.request,
                response=response,
            )
        return _parse_chat_completion(response.json(), latency_ms=latency_ms)
def _parse_chat_completion(payload: dict[str, Any], *, latency_ms: int) -> OpenRouterResponse:
    """Tolerant parser for OpenRouter / OpenAI chat-completions JSON.

    OpenRouter passes through any provider-specific extras, but the
    canonical shape is ``choices[0].message.content`` (string OR array
    of content parts) and ``usage.prompt_tokens / completion_tokens / total_tokens``.
    Cost is read from ``payload["usage"]["cost"]`` and, when that is
    absent, from top-level ``payload["cost"]``.

    Args:
        payload: Decoded JSON body of the response.
        latency_ms: Request duration measured by the caller; copied through.

    Returns:
        An ``OpenRouterResponse``. Missing pieces fall back to ``""`` /
        ``0`` / ``None``; a cost that cannot be converted to an integer
        (non-numeric, NaN or infinite) is recorded as ``0``.
    """
    text = ""
    finish_reason: str | None = None
    choices = payload.get("choices") or []
    if choices:
        first_choice = choices[0] or {}
        message = first_choice.get("message") or {}
        content = message.get("content")
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            # Multi-part content: keep only the textual parts, in order.
            chunks: list[str] = []
            for part in content:
                if isinstance(part, dict) and part.get("type") in {"text", "output_text"}:
                    chunks.append(str(part.get("text", "")))
            text = "".join(chunks)
        finish_reason = first_choice.get("finish_reason") or None

    usage = payload.get("usage") or {}
    input_tokens = int(usage.get("prompt_tokens") or 0)
    output_tokens = int(usage.get("completion_tokens") or 0)
    total_tokens = int(usage.get("total_tokens") or (input_tokens + output_tokens))

    # OpenRouter exposes cost in dollars on `usage.cost` or `cost`. We
    # convert to integer micros to avoid float-summing surprises across
    # 7,663 MIRAGE questions.
    raw_cost = usage.get("cost")
    if raw_cost is None:
        raw_cost = payload.get("cost")
    cost_micros = 0
    if raw_cost is not None:
        try:
            cost_micros = int(round(float(raw_cost) * 1_000_000))
        except (TypeError, ValueError, OverflowError):
            # ValueError covers non-numeric strings and NaN; OverflowError
            # covers +/-inf (e.g. JSON ``1e999``), which round() cannot
            # turn into an int.
            cost_micros = 0

    return OpenRouterResponse(
        text=text,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        total_tokens=total_tokens,
        cost_micros=cost_micros,
        latency_ms=latency_ms,
        finish_reason=finish_reason,
        raw=payload,
    )
__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse", "PdfEngine"]

View file

@ -0,0 +1,265 @@
"""Suite + Benchmark protocols and the global registry.
The extensibility seam: ``core.cli`` walks ``surfsense_evals.suites`` on
import, which auto-imports every benchmark subpackage, which calls
``register(<benchmark>)`` at module bottom. The CLI then iterates the
populated registry to build subcommand groups dynamically.
Adding a new domain = drop a folder under ``suites/<domain>/<bench>/``
that ends in ``register(MyBenchmark())``. No edits anywhere in
``core/`` are required.
"""
from __future__ import annotations
import argparse
from collections.abc import Mapping
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Protocol, runtime_checkable
import httpx
from .clients import DocumentsClient, NewChatClient, SearchSpaceClient
from .config import Config, SuiteState
# ---------------------------------------------------------------------------
# Run context — what every benchmark.ingest/run receives
# ---------------------------------------------------------------------------
@dataclass
class RunContext:
    """Environment handed to a benchmark's ``ingest`` and ``run`` for one invocation.

    It exposes the pinned suite state, client factories bound to the
    shared ``http`` session, the data / reports directories, and the
    active model slugs.

    ``http`` is the authenticated SurfSense client (auth event hook
    attached). It is **not** an OpenRouter client — providers create
    their own short-lived clients because OpenRouter doesn't share the
    SurfSense bearer.
    """

    suite: str
    benchmark: str
    config: Config
    suite_state: SuiteState
    http: httpx.AsyncClient

    # -- values read straight from the pinned suite state ------------------

    @property
    def search_space_id(self) -> int:
        """Search space id recorded in the suite state."""
        return self.suite_state.search_space_id

    @property
    def agent_llm_id(self) -> int:
        """Agent LLM id recorded in the suite state."""
        return self.suite_state.agent_llm_id

    @property
    def provider_model(self) -> str:
        """Slug used by the SurfSense agent (and, by default, the native arm).

        In ``cost-arbitrage`` scenarios this is the *cheap, text-only*
        slug: SurfSense answers from the chunks the vision LLM already
        extracted at ingest, and the native arm should read
        ``native_arm_model`` instead.
        """
        return self.suite_state.provider_model

    @property
    def native_arm_model(self) -> str:
        """Slug the native_pdf arm should use.

        Equals ``provider_model`` for head-to-head / symmetric-cheap; for
        ``cost-arbitrage`` it is the explicit ``--native-arm-model`` so
        the native arm can fairly answer image-bearing questions.
        """
        return self.suite_state.effective_native_arm_model

    @property
    def vision_provider_model(self) -> str | None:
        """Slug of the OpenRouter vision LLM SurfSense used at ingest.

        ``None`` when no vision config was attached at setup (legacy or
        text-only suite). Runners use it only to record what was used in
        ``RunArtifact.extra`` and to label reports.
        """
        return self.suite_state.vision_provider_model

    @property
    def scenario(self) -> str:
        """Scenario name pinned at setup time (see ``config.SCENARIOS``)."""
        return self.suite_state.scenario

    # -- API clients sharing the authenticated session ---------------------

    def search_space_client(self) -> SearchSpaceClient:
        """Build a ``SearchSpaceClient`` on the shared session."""
        api_base = self.config.surfsense_api_base
        return SearchSpaceClient(self.http, api_base)

    def documents_client(self) -> DocumentsClient:
        """Build a ``DocumentsClient`` on the shared session."""
        api_base = self.config.surfsense_api_base
        return DocumentsClient(self.http, api_base)

    def new_chat_client(self) -> NewChatClient:
        """Build a ``NewChatClient`` on the shared session."""
        api_base = self.config.surfsense_api_base
        return NewChatClient(self.http, api_base)

    # -- directories (created on first use) --------------------------------

    def maps_dir(self) -> Path:
        """Return the suite's maps directory, creating it if missing."""
        directory = self.config.suite_maps_dir(self.suite)
        directory.mkdir(parents=True, exist_ok=True)
        return directory

    def runs_dir(self, *, run_timestamp: str) -> Path:
        """Return ``<suite runs>/<run_timestamp>/<benchmark>``, creating it if missing."""
        suite_runs = self.config.suite_runs_dir(self.suite)
        directory = suite_runs / run_timestamp / self.benchmark
        directory.mkdir(parents=True, exist_ok=True)
        return directory

    def benchmark_data_dir(self) -> Path:
        """Return ``<suite data>/<benchmark>``, creating it if missing."""
        suite_data = self.config.suite_data_dir(self.suite)
        directory = suite_data / self.benchmark
        directory.mkdir(parents=True, exist_ok=True)
        return directory
# ---------------------------------------------------------------------------
# Run artifact + report section
# ---------------------------------------------------------------------------
@dataclass
class RunArtifact:
    """What a runner persists so the report writer can consume it later.

    ``raw_path`` is the JSONL file of per-question ``ArmResult`` rows.
    ``metrics`` is a free-form dict filled in by the benchmark (e.g.
    ``{"native": {...}, "surfsense": {...}, "delta": {...}}``).
    """

    suite: str
    benchmark: str
    run_timestamp: str
    raw_path: Path
    # Both dicts default to a fresh, empty dict per instance.
    metrics: dict[str, Any] = field(default_factory=dict)
    extra: dict[str, Any] = field(default_factory=dict)
@dataclass
class ReportSection:
    """A single benchmark's contribution to the final summary."""

    title: str
    # Whether the section is a headline one (vs. a secondary measurement).
    headline: bool
    # Markdown body of the section.
    body_md: str
    # Structured counterpart of the body; a fresh empty dict by default.
    body_json: dict[str, Any] = field(default_factory=dict)
# ---------------------------------------------------------------------------
# Benchmark protocol + registry
# ---------------------------------------------------------------------------
@runtime_checkable
class Benchmark(Protocol):
    """Interface of the object each benchmark module passes to ``register(<x>)``."""

    # ``(suite, name)`` is the key a benchmark is registered under.
    suite: str
    name: str
    headline: bool
    description: str

    async def ingest(self, ctx: RunContext, **opts: Any) -> None:  # pragma: no cover - protocol
        """Ingest step; receives the run context plus benchmark options."""
        ...

    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover - protocol
        """Run step; returns the artifact describing the run."""
        ...

    def add_run_args(self, parser: argparse.ArgumentParser) -> None:  # pragma: no cover - protocol
        """Add benchmark-specific flags to ``run <suite> <benchmark>``."""

    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:  # pragma: no cover - protocol
        """Turn run artifacts into this benchmark's report section."""
        ...
# ---------------------------------------------------------------------------
# Registry storage
# ---------------------------------------------------------------------------
# Module-level store of registered benchmarks. ``register`` files each entry
# under the ``(benchmark.suite, benchmark.name)`` tuple.
_REGISTRY: dict[tuple[str, str], Benchmark] = {}
def register(benchmark: Benchmark) -> None:
    """Store ``benchmark`` under ``(suite, name)``; a later duplicate replaces the earlier one.

    A duplicate only logs a warning instead of raising, so a benchmark
    module imported twice (once via auto-discovery, once by a test that
    imports it directly) doesn't blow up the CLI.
    """
    suite, name = benchmark.suite, benchmark.name
    already_present = (suite, name) in _REGISTRY
    if already_present:
        import logging

        log = logging.getLogger(__name__)
        log.warning("Benchmark %s/%s re-registered (overwriting prior)", suite, name)
    _REGISTRY[(suite, name)] = benchmark
def unregister(suite: str, name: str) -> None:
    """Test helper: remove one benchmark from the registry; unknown keys are ignored."""
    key = (suite, name)
    _REGISTRY.pop(key, None)
def reset() -> None:
    """Test helper: empty the registry in place (use with monkeypatched discovery)."""
    _REGISTRY.clear()
def get(suite: str, name: str) -> Benchmark:
    """Return the benchmark registered as ``suite/name``.

    Raises:
        KeyError: If nothing is registered under that key; the message
            lists every registered ``suite/name`` (or ``<none>``).
    """
    key = (suite, name)
    try:
        found = _REGISTRY[key]
    except KeyError as exc:
        known = [f"{s}/{n}" for s, n in sorted(_REGISTRY)]
        available = ", ".join(known) or "<none>"
        raise KeyError(
            f"Unknown benchmark '{suite}/{name}'. Registered: {available}"
        ) from exc
    return found
def list_suites() -> list[str]:
    """Return the distinct suite names in the registry, sorted."""
    suite_names = set(suite for suite, _name in _REGISTRY)
    return sorted(suite_names)
def list_benchmarks(suite: str | None = None) -> list[Benchmark]:
    """Return registered benchmarks ordered by ``(suite, name)``.

    With ``suite`` given, only that suite's benchmarks are returned.
    """
    ordered_keys = sorted(_REGISTRY)
    if suite is not None:
        ordered_keys = [key for key in ordered_keys if key[0] == suite]
    return [_REGISTRY[key] for key in ordered_keys]
def snapshot() -> Mapping[tuple[str, str], Benchmark]:
    """Return a shallow copy of the registry for diagnostics (e.g. ``benchmarks list`` rendering)."""
    return _REGISTRY.copy()
__all__ = [
"Arm",
"Benchmark",
"ReportSection",
"RunArtifact",
"RunContext",
"get",
"list_benchmarks",
"list_suites",
"register",
"reset",
"snapshot",
"unregister",
]
# Re-export Arm from arms.base so suites can `from core.registry import Arm`.
from .arms.base import Arm # noqa: E402, F401 (deliberate re-export at bottom)

View file

@ -0,0 +1,18 @@
"""Report writer + section composition primitives. Lazy import."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING: # pragma: no cover
from .writer import write_report
__all__ = ["write_report"]
def __getattr__(name: str):
    """Module-level attribute hook: import ``write_report`` only when it is first requested."""
    if name != "write_report":
        raise AttributeError(f"module 'surfsense_evals.core.report' has no attribute {name!r}")

    # Deferred import keeps importing this package cheap.
    from .writer import write_report

    return write_report

View file

@ -0,0 +1,89 @@
"""Report writer — composes per-benchmark sections into one summary.
Output:
* ``reports/<suite>/<run-timestamp>/summary.md`` — human-readable.
Bullet lists only (no tables) per project's coding-standards.
* ``reports/<suite>/<run-timestamp>/summary.json`` — same content as
structured JSON for downstream tooling (CI dashboards, regressions).
Headline benchmarks come first in both outputs.
"""
from __future__ import annotations
import json
from collections.abc import Iterable
from pathlib import Path
from ..config import Config
from ..registry import ReportSection
def write_report(
    *,
    config: Config,
    suite: str,
    sections: Iterable[ReportSection],
    run_timestamp: str,
) -> Path:
    """Write ``summary.md`` and ``summary.json`` for one run; return the ``.md`` path.

    Sections are ordered headline-first, then case-insensitively by title.
    Both files go to ``config.suite_reports_dir(suite) / run_timestamp``,
    which is created if missing.
    """
    ordered = sorted(sections, key=lambda s: (not s.headline, s.title.lower()))

    report_dir = config.suite_reports_dir(suite) / run_timestamp
    report_dir.mkdir(parents=True, exist_ok=True)
    md_path = report_dir / "summary.md"
    json_path = report_dir / "summary.json"

    lines: list[str] = [
        f"# SurfSense evals — suite `{suite}`",
        "",
        f"- Run timestamp: `{run_timestamp}`",
        f"- Sections: {len(ordered)}",
        "",
    ]

    # One markdown group per kind; a group with no sections is left out.
    groups = (
        ("## Headline", [s for s in ordered if s.headline]),
        ("## Secondary measurements", [s for s in ordered if not s.headline]),
    )
    for heading, members in groups:
        if not members:
            continue
        lines += [heading, ""]
        for section in members:
            lines += [f"### {section.title}", "", section.body_md.rstrip(), ""]

    md_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")

    section_records = []
    for section in ordered:
        section_records.append(
            {
                "title": section.title,
                "headline": section.headline,
                "body_md": section.body_md,
                "body_json": section.body_json,
            }
        )
    json_payload = {
        "suite": suite,
        "run_timestamp": run_timestamp,
        "sections": section_records,
    }
    json_text = json.dumps(json_payload, indent=2, sort_keys=True) + "\n"
    json_path.write_text(json_text, encoding="utf-8")

    return md_path
__all__ = ["ReportSection", "write_report"]

View file

@ -0,0 +1,58 @@
"""Shared scenario formatting helpers for head-to-head benchmark reports.
The scenario chosen at ``setup`` time (``head-to-head``, ``symmetric-cheap``,
``cost-arbitrage``) materially changes how a head-to-head report should be
read. This module produces the one-bullet summary every head-to-head
runner stamps near the top of its ``report_section`` body so reviewers
immediately see the framing — no need to dig into ``run_artifact.json``.
"""
from __future__ import annotations
from collections.abc import Mapping
from typing import Any
def format_scenario_md(extra: Mapping[str, Any] | None) -> str:
    """Build the one-bullet scenario summary for a benchmark report.

    Uses ``extra["scenario"]`` together with the model slugs recorded in
    ``extra``. A missing or unrecognised scenario renders the
    "head-to-head" line, so artifacts written before scenarios existed
    still produce a sensible bullet.
    """
    fields = dict(extra or {})
    scenario = str(fields.get("scenario") or "head-to-head")
    surf_slug = str(fields.get("provider_model") or "?")
    native_slug = str(fields.get("native_arm_model") or surf_slug)
    vision_slug = fields.get("vision_provider_model")

    if scenario == "cost-arbitrage":
        extracted_by = f" by `{vision_slug}`" if vision_slug else ""
        return (
            f"- Scenario: **cost-arbitrage** — native arm answers with "
            f"`{native_slug}` (vision); SurfSense answers with `{surf_slug}` "
            f"over chunks vision-extracted at ingest{extracted_by}. "
            "Measures how close SurfSense gets to native at a fraction of "
            "the per-query cost."
        )

    if scenario == "symmetric-cheap":
        extracted_via = f" via `{vision_slug}`" if vision_slug else ""
        return (
            f"- Scenario: **symmetric-cheap** — both arms answer with "
            f"`{surf_slug}`; SurfSense pre-extracted images at ingest{extracted_via}. "
            "Native arm structurally loses on image-bearing questions "
            "(text-only model can't see images) — that's the point."
        )

    # Default framing: both arms share one model.
    bullet = (
        f"- Scenario: head-to-head — both arms answer with `{surf_slug}` "
        "via OpenRouter."
    )
    if vision_slug:
        bullet += f" SurfSense ingest VLM: `{vision_slug}`."
    return bullet
__all__ = ["format_scenario_md"]

View file

@ -0,0 +1,127 @@
"""Vision LLM resolution + auto-pick logic for the harness's ``setup`` command.
Two responsibilities:
1. Resolve an explicit ``--vision-llm <slug>`` to a global OpenRouter
vision LLM config id that ``set_llm_preferences(vision_llm_config_id=...)``
can accept.
2. Auto-pick the strongest registered vision config when the operator
doesn't pass ``--vision-llm`` but the scenario / benchmark needs one.
The priority list mirrors the recommended slugs in the README so the
auto-pick is deterministic and reviewable.
"""
from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
from .clients.search_space import VisionLlmConfigEntry
# Order matters — first match wins when auto-picking. Keep these in sync
# with the "Recommended vision slugs" table in the README so the
# auto-pick story is the same one users read about.
# ``resolve_vision_llm`` walks this tuple front to back and returns the first
# slug that has a registered OpenRouter vision config.
RECOMMENDED_VISION_PRIORITY: tuple[str, ...] = (
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-opus-4.7",
    "openai/gpt-5",
    "google/gemini-2.5-pro",
)
class VisionConfigError(RuntimeError):
    """Signals that no vision LLM config could be resolved, whether the slug
    was given explicitly or auto-picked."""
@dataclass(frozen=True)
class ResolvedVisionConfig:
    """Outcome of ``resolve_vision_llm``: the config to attach plus a label for logs."""

    # Id of the chosen vision LLM config.
    config_id: int
    # The chosen config's ``model_name``.
    provider_model: str
    # How it was chosen: "explicit" | "auto-priority" | "auto-fallback"
    selected_via: str
def _openrouter_only(entries: Iterable[VisionLlmConfigEntry]) -> list[VisionLlmConfigEntry]:
    """Keep, in input order, the entries whose provider is ``"OPENROUTER"`` and that are not auto-mode."""
    kept: list[VisionLlmConfigEntry] = []
    for entry in entries:
        if entry.provider != "OPENROUTER" or entry.is_auto_mode:
            continue
        kept.append(entry)
    return kept
def resolve_vision_llm(
    candidates: list[VisionLlmConfigEntry],
    *,
    explicit_slug: str | None,
) -> ResolvedVisionConfig:
    """Pick a vision LLM config, either by slug or automatically.

    Only non-auto-mode OpenRouter entries from ``candidates`` are considered.

    * With ``explicit_slug``: exactly one entry's ``model_name`` must equal
      it; zero or several matches raise ``VisionConfigError`` with a
      listing to help the operator.
    * Without it: the first slug of ``RECOMMENDED_VISION_PRIORITY`` that
      is registered wins (``"auto-priority"``); if none is registered the
      first eligible entry is used (``"auto-fallback"``).

    Raises:
        VisionConfigError: On an unmatched or ambiguous explicit slug, or
            when no eligible entry exists for auto-picking.
    """
    pool = _openrouter_only(candidates)

    if explicit_slug is not None:
        hits = [entry for entry in pool if entry.model_name == explicit_slug]
        if len(hits) == 1:
            chosen = hits[0]
            return ResolvedVisionConfig(
                config_id=chosen.id,
                provider_model=chosen.model_name,
                selected_via="explicit",
            )
        if hits:
            # Several configs share the slug: show them so one can be removed.
            listing = "\n".join(f" id={e.id} name={e.name!r}" for e in hits)
            raise VisionConfigError(
                f"Multiple OpenRouter vision configs match '{explicit_slug}':\n{listing}"
            )
        sample = ", ".join(e.model_name for e in pool[:8]) or "<none>"
        raise VisionConfigError(
            f"No OpenRouter vision config found for slug '{explicit_slug}'. "
            "Make sure `openrouter_integration.vision_enabled: true` in "
            "global_llm_config.yaml and that the Celery worker has finished "
            "its first refresh. "
            f"Available OpenRouter vision slugs (sample): {sample}."
        )

    if not pool:
        raise VisionConfigError(
            "No OpenRouter vision LLM configs are registered with this "
            "SurfSense backend. Either pass `--no-vision-llm` to the ingest "
            "step (text-only ingestion), or enable "
            "`openrouter_integration.vision_enabled: true` in "
            "global_llm_config.yaml so the Celery worker syncs vision-capable "
            "OpenRouter models on next refresh."
        )

    # Later entries replace earlier ones that share a slug.
    by_slug = {}
    for entry in pool:
        by_slug[entry.model_name] = entry

    preferred = next(
        (by_slug[slug] for slug in RECOMMENDED_VISION_PRIORITY if slug in by_slug),
        None,
    )
    if preferred is not None:
        return ResolvedVisionConfig(
            config_id=preferred.id,
            provider_model=preferred.model_name,
            selected_via="auto-priority",
        )

    # Fallback: first registered OpenRouter vision config. Deterministic
    # because the backend returns them in a stable order.
    first = pool[0]
    return ResolvedVisionConfig(
        config_id=first.id,
        provider_model=first.model_name,
        selected_via="auto-fallback",
    )
__all__ = [
"RECOMMENDED_VISION_PRIORITY",
"ResolvedVisionConfig",
"VisionConfigError",
"resolve_vision_llm",
]

Some files were not shown because too many files have changed in this diff Show more